implemented changes from #9497: sort abstracts by string length, included author fullnames in the related results, expanded instance details within each children/result XML element

This commit is contained in:
Claudio Atzori 2024-03-14 16:21:23 +01:00
parent 7863c92466
commit af154d4456
6 changed files with 118 additions and 32 deletions

View File

@ -506,6 +506,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.sorted((s1, s2) -> s2.getValue().length() - s1.getValue().length())
.limit(ModelHardLimits.MAX_ABSTRACTS)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -27,14 +28,7 @@ import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import scala.Tuple2; import scala.Tuple2;
@ -156,16 +150,33 @@ public class CreateRelatedEntitiesJob_phase1 {
case software: case software:
final Result result = (Result) entity; final Result result = (Result) entity;
if (result.getTitle() != null && !result.getTitle().isEmpty()) { if (Objects.nonNull(result.getTitle()) && !result.getTitle().isEmpty()) {
final StructuredProperty title = result.getTitle().stream().findFirst().get(); result
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); .getTitle()
re.setTitle(title); .stream()
.findFirst()
.map(StructuredProperty::getValue)
.ifPresent(
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
} }
if (result.getDescription() != null && !result.getDescription().isEmpty()) { if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
final Field<String> description = result.getDescription().stream().findFirst().get(); result
if (StringUtils.isNotBlank(description.getValue())) { .getDescription()
re.setDescription(StringUtils.left(description.getValue(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)); .stream()
.findFirst()
.map(Field::getValue)
.ifPresent(
d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)));
} }
if (Objects.nonNull(result.getAuthor()) && !result.getAuthor().isEmpty()) {
re
.setAuthor(
result
.getAuthor()
.stream()
.map(Author::getFullname)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList()));
} }
re.setDateofacceptance(getValue(result.getDateofacceptance())); re.setDateofacceptance(getValue(result.getDateofacceptance()));

View File

@ -63,32 +63,32 @@ public class ProvisionModelSupport {
} }
public static SolrRecord transform(JoinedEntity je, ContextMapper contextMapper, VocabularyGroup vocs) { public static SolrRecord transform(JoinedEntity je, ContextMapper contextMapper, VocabularyGroup vocs) {
SolrRecord record = new SolrRecord(); SolrRecord r = new SolrRecord();
final OafEntity e = je.getEntity(); final OafEntity e = je.getEntity();
final RecordType type = RecordType.valueOf(e.getClass().getSimpleName().toLowerCase()); final RecordType type = RecordType.valueOf(e.getClass().getSimpleName().toLowerCase());
final Boolean deletedbyinference = Optional final Boolean deletedbyinference = Optional
.ofNullable(e.getDataInfo()) .ofNullable(e.getDataInfo())
.map(DataInfo::getDeletedbyinference) .map(DataInfo::getDeletedbyinference)
.orElse(null); .orElse(null);
record r
.setHeader( .setHeader(
SolrRecordHeader SolrRecordHeader
.newInstance( .newInstance(
e.getId(), e.getOriginalId(), type, deletedbyinference)); e.getId(), e.getOriginalId(), type, deletedbyinference));
record.setCollectedfrom(asProvenance(e.getCollectedfrom())); r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
record.setContext(asContext(e.getContext(), contextMapper)); r.setContext(asContext(e.getContext(), contextMapper));
record.setPid(asPid(e.getPid())); r.setPid(asPid(e.getPid()));
if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) { if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
record.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e)); r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Datasource) { } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Datasource) {
record.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e)); r.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Organization) { } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Organization) {
record.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e)); r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) { } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
record.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs)); r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
} }
record r
.setLinks( .setLinks(
Optional Optional
.ofNullable(je.getLinks()) .ofNullable(je.getLinks())
@ -99,7 +99,7 @@ public class ProvisionModelSupport {
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null)); .orElse(null));
return record; return r;
} }
private static RelatedRecord mapRelatedRecord(RelatedEntityWrapper rew, VocabularyGroup vocs) { private static RelatedRecord mapRelatedRecord(RelatedEntityWrapper rew, VocabularyGroup vocs) {

View File

@ -24,6 +24,7 @@ public class RelatedEntity implements Serializable {
// results // results
private String description; private String description;
private List<String> author;
private String dateofacceptance; private String dateofacceptance;
private String publisher; private String publisher;
private List<StructuredProperty> pid; private List<StructuredProperty> pid;
@ -86,6 +87,14 @@ public class RelatedEntity implements Serializable {
this.description = description; this.description = description;
} }
public List<String> getAuthor() {
return author;
}
public void setAuthor(List<String> author) {
this.author = author;
}
public void setWebsiteurl(String websiteurl) { public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl; this.websiteurl = websiteurl;
} }

View File

@ -100,13 +100,17 @@ public class TemplateFactory {
public String getInstance( public String getInstance(
final List<String> instancemetadata, final String url) { final List<String> instancemetadata, final String url) {
return getInstance(instancemetadata, Lists.newArrayList(url));
}
public String getInstance(
final List<String> instancemetadata, final List<String> url) {
return getTemplate(resources.getInstance()) return getTemplate(resources.getInstance())
.add("metadata", instancemetadata) .add("metadata", instancemetadata)
.add( .add(
"webresources", "webresources",
Optional Optional
.ofNullable(url) .ofNullable(url)
.map(u -> Lists.newArrayList(url))
.orElse(Lists.newArrayList()) .orElse(Lists.newArrayList())
.stream() .stream()
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)

View File

@ -49,6 +49,7 @@ import eu.dnetlib.dhp.schema.common.*;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import scala.Tuple2; import scala.Tuple2;
public class XmlRecordFactory implements Serializable { public class XmlRecordFactory implements Serializable {
@ -365,6 +366,7 @@ public class XmlRecordFactory implements Serializable {
.getDescription() .getDescription()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.limit(ModelHardLimits.MAX_ABSTRACTS)
.map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue()))
.collect(Collectors.toCollection(HashSet::new))); .collect(Collectors.toCollection(HashSet::new)));
} }
@ -1057,7 +1059,8 @@ public class XmlRecordFactory implements Serializable {
return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()); return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue());
} }
private List<String> mapFields(final RelatedEntityWrapper link, final Set<String> contexts) { private List<String> mapFields(final TemplateFactory templateFactory, final RelatedEntityWrapper link,
final Set<String> contexts) {
final Relation rel = link.getRelation(); final Relation rel = link.getRelation();
final RelatedEntity re = link.getTarget(); final RelatedEntity re = link.getTarget();
final String targetType = link.getTarget().getType(); final String targetType = link.getTarget().getType();
@ -1074,6 +1077,15 @@ public class XmlRecordFactory implements Serializable {
if (StringUtils.isNotBlank(re.getDescription())) { if (StringUtils.isNotBlank(re.getDescription())) {
metadata.add(XmlSerializationUtils.asXmlElement("description", re.getDescription())); metadata.add(XmlSerializationUtils.asXmlElement("description", re.getDescription()));
} }
if (re.getAuthor() != null) {
metadata
.addAll(
re
.getAuthor()
.stream()
.map(author -> XmlSerializationUtils.asXmlElement("creator", author))
.collect(Collectors.toList()));
}
if (isNotBlank(re.getDateofacceptance())) { if (isNotBlank(re.getDateofacceptance())) {
metadata metadata
.add(XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); .add(XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance()));
@ -1107,6 +1119,54 @@ public class XmlRecordFactory implements Serializable {
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (re.getInstances() != null) {
re
.getInstances()
.forEach(i -> {
final List<String> instanceFields = Lists.newArrayList();
if (i.getAccessright() != null && !i.getAccessright().isBlank()) {
instanceFields
.add(XmlSerializationUtils.mapQualifier("accessright", i.getAccessright()));
}
if (i.getHostedby() != null) {
instanceFields.add(XmlSerializationUtils.mapKeyValue("hostedby", i.getHostedby()));
}
if (i.getDateofacceptance() != null && isNotBlank(i.getDateofacceptance().getValue())) {
instanceFields
.add(
XmlSerializationUtils
.asXmlElement("dateofacceptance", i.getDateofacceptance().getValue()));
}
if (i.getInstancetype() != null && !i.getInstancetype().isBlank()) {
instanceFields
.add(XmlSerializationUtils.mapQualifier("instancetype", i.getInstancetype()));
}
if (i.getRefereed() != null && !i.getRefereed().isBlank()) {
instanceFields.add(XmlSerializationUtils.mapQualifier("refereed", i.getRefereed()));
}
if (i.getLicense() != null && isNotBlank(i.getLicense().getValue())) {
instanceFields
.add(XmlSerializationUtils.asXmlElement("license", i.getLicense().getValue()));
}
if (isNotBlank(i.getFulltext())) {
instanceFields.add(XmlSerializationUtils.asXmlElement("fulltext", i.getFulltext()));
}
if (i.getUrl() != null && !i.getUrl().isEmpty()) {
instanceFields
.addAll(
i
.getUrl()
.stream()
.filter(StringUtils::isNotBlank)
.map(url -> XmlSerializationUtils.asXmlElement("url", url))
.collect(Collectors.toList()));
}
metadata.add(templateFactory.getInstance(instanceFields, i.getUrl()));
});
}
break; break;
case datasource: case datasource:
if (isNotBlank(re.getOfficialname())) { if (isNotBlank(re.getOfficialname())) {
@ -1188,7 +1248,7 @@ public class XmlRecordFactory implements Serializable {
throw new IllegalArgumentException( throw new IllegalArgumentException(
String.format("missing scheme for: <%s - %s>", type, targetType)); String.format("missing scheme for: <%s - %s>", type, targetType));
} }
final HashSet<String> fields = Sets.newHashSet(mapFields(link, contexts)); final HashSet<String> fields = Sets.newHashSet(mapFields(templateFactory, link, contexts));
if (rel.getValidated() == null) { if (rel.getValidated() == null) {
rel.setValidated(false); rel.setValidated(false);
} }
@ -1212,7 +1272,7 @@ public class XmlRecordFactory implements Serializable {
.map(link -> { .map(link -> {
final String targetType = link.getTarget().getType(); final String targetType = link.getTarget().getType();
final String name = ModelSupport.getMainType(EntityType.valueOf(targetType)); final String name = ModelSupport.getMainType(EntityType.valueOf(targetType));
final HashSet<String> fields = Sets.newHashSet(mapFields(link, null)); final HashSet<String> fields = Sets.newHashSet(mapFields(templateFactory, link, null));
return templateFactory return templateFactory
.getChild(name, link.getTarget().getId(), Lists.newArrayList(fields)); .getChild(name, link.getTarget().getId(), Lists.newArrayList(fields));
}) })