Implemented changes from #9497: sorted abstracts by string length, included author full names in the related results, expanded instance details within each children/result XML element

This commit is contained in:
Claudio Atzori 2024-03-14 16:21:23 +01:00
parent 7863c92466
commit af154d4456
6 changed files with 118 additions and 32 deletions

View File

@ -506,6 +506,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(GraphCleaningFunctions::cleanValue)
.sorted((s1, s2) -> s2.getValue().length() - s1.getValue().length())
.limit(ModelHardLimits.MAX_ABSTRACTS)
.collect(Collectors.toList()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -27,14 +28,7 @@ import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import scala.Tuple2;
@ -156,16 +150,33 @@ public class CreateRelatedEntitiesJob_phase1 {
case software:
final Result result = (Result) entity;
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
final StructuredProperty title = result.getTitle().stream().findFirst().get();
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
re.setTitle(title);
if (Objects.nonNull(result.getTitle()) && !result.getTitle().isEmpty()) {
result
.getTitle()
.stream()
.findFirst()
.map(StructuredProperty::getValue)
.ifPresent(
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
}
if (result.getDescription() != null && !result.getDescription().isEmpty()) {
final Field<String> description = result.getDescription().stream().findFirst().get();
if (StringUtils.isNotBlank(description.getValue())) {
re.setDescription(StringUtils.left(description.getValue(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH));
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
result
.getDescription()
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(
d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)));
}
if (Objects.nonNull(result.getAuthor()) && !result.getAuthor().isEmpty()) {
re
.setAuthor(
result
.getAuthor()
.stream()
.map(Author::getFullname)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList()));
}
re.setDateofacceptance(getValue(result.getDateofacceptance()));

View File

@ -63,32 +63,32 @@ public class ProvisionModelSupport {
}
public static SolrRecord transform(JoinedEntity je, ContextMapper contextMapper, VocabularyGroup vocs) {
SolrRecord record = new SolrRecord();
SolrRecord r = new SolrRecord();
final OafEntity e = je.getEntity();
final RecordType type = RecordType.valueOf(e.getClass().getSimpleName().toLowerCase());
final Boolean deletedbyinference = Optional
.ofNullable(e.getDataInfo())
.map(DataInfo::getDeletedbyinference)
.orElse(null);
record
r
.setHeader(
SolrRecordHeader
.newInstance(
e.getId(), e.getOriginalId(), type, deletedbyinference));
record.setCollectedfrom(asProvenance(e.getCollectedfrom()));
record.setContext(asContext(e.getContext(), contextMapper));
record.setPid(asPid(e.getPid()));
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
r.setContext(asContext(e.getContext(), contextMapper));
r.setPid(asPid(e.getPid()));
if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
record.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Datasource) {
record.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e));
r.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Organization) {
record.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
record.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
}
record
r
.setLinks(
Optional
.ofNullable(je.getLinks())
@ -99,7 +99,7 @@ public class ProvisionModelSupport {
.collect(Collectors.toList()))
.orElse(null));
return record;
return r;
}
private static RelatedRecord mapRelatedRecord(RelatedEntityWrapper rew, VocabularyGroup vocs) {

View File

@ -24,6 +24,7 @@ public class RelatedEntity implements Serializable {
// results
private String description;
private List<String> author;
private String dateofacceptance;
private String publisher;
private List<StructuredProperty> pid;
@ -86,6 +87,14 @@ public class RelatedEntity implements Serializable {
this.description = description;
}
/**
 * Returns the author names stored on this related entity.
 *
 * @return the current value of the {@code author} field; may be {@code null} if it was never set
 */
public List<String> getAuthor() {
	return this.author;
}
/**
 * Replaces the author names stored on this related entity.
 *
 * @param author the list to store; kept by reference, no defensive copy is made
 */
public void setAuthor(List<String> author) {
this.author = author;
}
/**
 * Assigns the {@code websiteurl} field of this related entity.
 *
 * @param websiteurl the value to store
 */
public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
}

View File

@ -100,13 +100,17 @@ public class TemplateFactory {
/**
 * Convenience overload for an instance that has a single URL: wraps {@code url} in a list and
 * delegates to {@code getInstance(List, List)}.
 *
 * @param instancemetadata the serialized metadata fields of the instance
 * @param url the single web resource URL of the instance
 * @return whatever the list-based overload returns for the given metadata and the one-element URL list
 */
public String getInstance(
	final List<String> instancemetadata, final String url) {
	final List<String> webresources = Lists.newArrayList(url);
	return getInstance(instancemetadata, webresources);
}
public String getInstance(
final List<String> instancemetadata, final List<String> url) {
return getTemplate(resources.getInstance())
.add("metadata", instancemetadata)
.add(
"webresources",
Optional
.ofNullable(url)
.map(u -> Lists.newArrayList(url))
.orElse(Lists.newArrayList())
.stream()
.filter(StringUtils::isNotBlank)

View File

@ -49,6 +49,7 @@ import eu.dnetlib.dhp.schema.common.*;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import scala.Tuple2;
public class XmlRecordFactory implements Serializable {
@ -365,6 +366,7 @@ public class XmlRecordFactory implements Serializable {
.getDescription()
.stream()
.filter(Objects::nonNull)
.limit(ModelHardLimits.MAX_ABSTRACTS)
.map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue()))
.collect(Collectors.toCollection(HashSet::new)));
}
@ -1057,7 +1059,8 @@ public class XmlRecordFactory implements Serializable {
return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue());
}
private List<String> mapFields(final RelatedEntityWrapper link, final Set<String> contexts) {
private List<String> mapFields(final TemplateFactory templateFactory, final RelatedEntityWrapper link,
final Set<String> contexts) {
final Relation rel = link.getRelation();
final RelatedEntity re = link.getTarget();
final String targetType = link.getTarget().getType();
@ -1074,6 +1077,15 @@ public class XmlRecordFactory implements Serializable {
if (StringUtils.isNotBlank(re.getDescription())) {
metadata.add(XmlSerializationUtils.asXmlElement("description", re.getDescription()));
}
if (re.getAuthor() != null) {
metadata
.addAll(
re
.getAuthor()
.stream()
.map(author -> XmlSerializationUtils.asXmlElement("creator", author))
.collect(Collectors.toList()));
}
if (isNotBlank(re.getDateofacceptance())) {
metadata
.add(XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance()));
@ -1107,6 +1119,54 @@ public class XmlRecordFactory implements Serializable {
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
.collect(Collectors.toList()));
}
if (re.getInstances() != null) {
re
.getInstances()
.forEach(i -> {
final List<String> instanceFields = Lists.newArrayList();
if (i.getAccessright() != null && !i.getAccessright().isBlank()) {
instanceFields
.add(XmlSerializationUtils.mapQualifier("accessright", i.getAccessright()));
}
if (i.getHostedby() != null) {
instanceFields.add(XmlSerializationUtils.mapKeyValue("hostedby", i.getHostedby()));
}
if (i.getDateofacceptance() != null && isNotBlank(i.getDateofacceptance().getValue())) {
instanceFields
.add(
XmlSerializationUtils
.asXmlElement("dateofacceptance", i.getDateofacceptance().getValue()));
}
if (i.getInstancetype() != null && !i.getInstancetype().isBlank()) {
instanceFields
.add(XmlSerializationUtils.mapQualifier("instancetype", i.getInstancetype()));
}
if (i.getRefereed() != null && !i.getRefereed().isBlank()) {
instanceFields.add(XmlSerializationUtils.mapQualifier("refereed", i.getRefereed()));
}
if (i.getLicense() != null && isNotBlank(i.getLicense().getValue())) {
instanceFields
.add(XmlSerializationUtils.asXmlElement("license", i.getLicense().getValue()));
}
if (isNotBlank(i.getFulltext())) {
instanceFields.add(XmlSerializationUtils.asXmlElement("fulltext", i.getFulltext()));
}
if (i.getUrl() != null && !i.getUrl().isEmpty()) {
instanceFields
.addAll(
i
.getUrl()
.stream()
.filter(StringUtils::isNotBlank)
.map(url -> XmlSerializationUtils.asXmlElement("url", url))
.collect(Collectors.toList()));
}
metadata.add(templateFactory.getInstance(instanceFields, i.getUrl()));
});
}
break;
case datasource:
if (isNotBlank(re.getOfficialname())) {
@ -1188,7 +1248,7 @@ public class XmlRecordFactory implements Serializable {
throw new IllegalArgumentException(
String.format("missing scheme for: <%s - %s>", type, targetType));
}
final HashSet<String> fields = Sets.newHashSet(mapFields(link, contexts));
final HashSet<String> fields = Sets.newHashSet(mapFields(templateFactory, link, contexts));
if (rel.getValidated() == null) {
rel.setValidated(false);
}
@ -1212,7 +1272,7 @@ public class XmlRecordFactory implements Serializable {
.map(link -> {
final String targetType = link.getTarget().getType();
final String name = ModelSupport.getMainType(EntityType.valueOf(targetType));
final HashSet<String> fields = Sets.newHashSet(mapFields(link, null));
final HashSet<String> fields = Sets.newHashSet(mapFields(templateFactory, link, null));
return templateFactory
.getChild(name, link.getTarget().getId(), Lists.newArrayList(fields));
})