From af154d44566af258fd00b832f23f9d1c0a6f7503 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 14 Mar 2024 16:21:23 +0100 Subject: [PATCH] implemented changes from #9497: sort abstracts by string length, included author fullnames in the related results, expanded instance details within each children/result XML element --- .../oaf/utils/GraphCleaningFunctions.java | 2 + .../CreateRelatedEntitiesJob_phase1.java | 48 ++++++++----- .../model/ProvisionModelSupport.java | 22 +++---- .../dhp/oa/provision/model/RelatedEntity.java | 9 +++ .../oa/provision/utils/TemplateFactory.java | 6 +- .../oa/provision/utils/XmlRecordFactory.java | 66 ++++++++++++++++++- 6 files changed, 121 insertions(+), 32 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index f01f90fe4f..9386db9339 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -506,6 +506,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .map(GraphCleaningFunctions::cleanValue) + .sorted((s1, s2) -> s2.getValue().length() - s1.getValue().length()) + .limit(ModelHardLimits.MAX_ABSTRACTS) .collect(Collectors.toList())); } if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 8cbda09031..9084f3ae3b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -27,14 +27,7 @@ import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import scala.Tuple2; @@ -156,16 +149,37 @@ public class CreateRelatedEntitiesJob_phase1 { case software: final Result result = (Result) entity; - if (result.getTitle() != null && !result.getTitle().isEmpty()) { - final StructuredProperty title = result.getTitle().stream().findFirst().get(); - title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); - re.setTitle(title); + if (Objects.nonNull(result.getTitle()) && !result.getTitle().isEmpty()) { + /* Truncate the first title and assign the whole property, as the replaced code did. NOTE(review): re presumably holds no title object yet, so calling re.getTitle().setValue(...) here would dereference null - confirm against RelatedEntity. */ + result + .getTitle() + .stream() + .findFirst() + .ifPresent(title -> { + title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); + re.setTitle(title); + }); } - if (result.getDescription() != null && !result.getDescription().isEmpty()) { - final Field description = result.getDescription().stream().findFirst().get(); - if 
(StringUtils.isNotBlank(description.getValue())) { - re.setDescription(StringUtils.left(description.getValue(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)); - } + if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) { + /* Blank abstracts are skipped, matching the isNotBlank check of the replaced code. */ + result + .getDescription() + .stream() + .findFirst() + .map(Field::getValue) + .filter(StringUtils::isNotBlank) + .ifPresent( + d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH))); + } + if (Objects.nonNull(result.getAuthor()) && !result.getAuthor().isEmpty()) { + re + .setAuthor( + result + .getAuthor() + .stream() + .map(Author::getFullname) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList())); } re.setDateofacceptance(getValue(result.getDateofacceptance())); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 7705c62c0e..0e6e95de58 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -63,32 +63,32 @@ public class ProvisionModelSupport { } public static SolrRecord transform(JoinedEntity je, ContextMapper contextMapper, VocabularyGroup vocs) { - SolrRecord record = new SolrRecord(); + SolrRecord r = new SolrRecord(); final OafEntity e = je.getEntity(); final RecordType type = RecordType.valueOf(e.getClass().getSimpleName().toLowerCase()); final Boolean deletedbyinference = Optional .ofNullable(e.getDataInfo()) .map(DataInfo::getDeletedbyinference) .orElse(null); - record + r .setHeader( SolrRecordHeader .newInstance( e.getId(), e.getOriginalId(), type, deletedbyinference)); - 
record.setCollectedfrom(asProvenance(e.getCollectedfrom())); - record.setContext(asContext(e.getContext(), contextMapper)); - 
record.setPid(asPid(e.getPid())); + r.setCollectedfrom(asProvenance(e.getCollectedfrom())); + r.setContext(asContext(e.getContext(), contextMapper)); + r.setPid(asPid(e.getPid())); if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) { - record.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e)); + r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e)); } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Datasource) { - record.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e)); + r.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e)); } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Organization) { - record.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e)); + r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e)); } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) { - record.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs)); + r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs)); } - record + r .setLinks( Optional .ofNullable(je.getLinks()) @@ -99,7 +99,7 @@ public class ProvisionModelSupport { .collect(Collectors.toList())) .orElse(null)); - return record; + return r; } private static RelatedRecord mapRelatedRecord(RelatedEntityWrapper rew, VocabularyGroup vocs) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index 12540aba08..ee010910c0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -24,6 +24,7 @@ public class RelatedEntity implements Serializable { // results private String description; + private List author; private String dateofacceptance; private String publisher; private List 
pid; @@ -86,6 +87,14 @@ public class RelatedEntity implements Serializable { this.description = description; } + public List getAuthor() { + return author; + } + + public void setAuthor(List author) { + this.author = author; + } + public void setWebsiteurl(String websiteurl) { this.websiteurl = websiteurl; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 87c0261ac0..befebe0bb7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -100,13 +100,17 @@ public class TemplateFactory { public String getInstance( final List instancemetadata, final String url) { + return getInstance(instancemetadata, Lists.newArrayList(url)); + } + + public String getInstance( + final List instancemetadata, final List url) { return getTemplate(resources.getInstance()) .add("metadata", instancemetadata) .add( "webresources", Optional .ofNullable(url) - .map(u -> Lists.newArrayList(url)) .orElse(Lists.newArrayList()) .stream() .filter(StringUtils::isNotBlank) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 304171399c..e1f5addfd0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -49,6 +49,7 @@ import eu.dnetlib.dhp.schema.common.*; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import 
eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import scala.Tuple2; public class XmlRecordFactory implements Serializable { @@ -365,6 +366,7 @@ public class XmlRecordFactory implements Serializable { .getDescription() .stream() .filter(Objects::nonNull) + .limit(ModelHardLimits.MAX_ABSTRACTS) .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) .collect(Collectors.toCollection(HashSet::new))); } @@ -1057,7 +1059,8 @@ public class XmlRecordFactory implements Serializable { return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()); } - private List mapFields(final RelatedEntityWrapper link, final Set contexts) { + private List mapFields(final TemplateFactory templateFactory, final RelatedEntityWrapper link, + final Set contexts) { final Relation rel = link.getRelation(); final RelatedEntity re = link.getTarget(); final String targetType = link.getTarget().getType(); @@ -1074,6 +1077,15 @@ public class XmlRecordFactory implements Serializable { if (StringUtils.isNotBlank(re.getDescription())) { metadata.add(XmlSerializationUtils.asXmlElement("description", re.getDescription())); } + if (re.getAuthor() != null) { + metadata + .addAll( + re + .getAuthor() + .stream() + .map(author -> XmlSerializationUtils.asXmlElement("creator", author)) + .collect(Collectors.toList())); + } if (isNotBlank(re.getDateofacceptance())) { metadata .add(XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); @@ -1107,6 +1119,54 @@ public class XmlRecordFactory implements Serializable { .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) .collect(Collectors.toList())); } + if (re.getInstances() != null) { + re + .getInstances() + .forEach(i -> { + final List instanceFields = Lists.newArrayList(); + if (i.getAccessright() != null && !i.getAccessright().isBlank()) { + instanceFields + .add(XmlSerializationUtils.mapQualifier("accessright", i.getAccessright())); + } + if (i.getHostedby() != 
null) { + instanceFields.add(XmlSerializationUtils.mapKeyValue("hostedby", i.getHostedby())); + } + if (i.getDateofacceptance() != null && isNotBlank(i.getDateofacceptance().getValue())) { + instanceFields + .add( + XmlSerializationUtils + .asXmlElement("dateofacceptance", i.getDateofacceptance().getValue())); + } + if (i.getInstancetype() != null && !i.getInstancetype().isBlank()) { + instanceFields + .add(XmlSerializationUtils.mapQualifier("instancetype", i.getInstancetype())); + } + + if (i.getRefereed() != null && !i.getRefereed().isBlank()) { + instanceFields.add(XmlSerializationUtils.mapQualifier("refereed", i.getRefereed())); + } + + if (i.getLicense() != null && isNotBlank(i.getLicense().getValue())) { + instanceFields + .add(XmlSerializationUtils.asXmlElement("license", i.getLicense().getValue())); + } + if (isNotBlank(i.getFulltext())) { + instanceFields.add(XmlSerializationUtils.asXmlElement("fulltext", i.getFulltext())); + } + if (i.getUrl() != null && !i.getUrl().isEmpty()) { + instanceFields + .addAll( + i + .getUrl() + .stream() + .filter(StringUtils::isNotBlank) + .map(url -> XmlSerializationUtils.asXmlElement("url", url)) + .collect(Collectors.toList())); + } + metadata.add(templateFactory.getInstance(instanceFields, i.getUrl())); + }); + } + break; case datasource: if (isNotBlank(re.getOfficialname())) { @@ -1188,7 +1248,7 @@ public class XmlRecordFactory implements Serializable { throw new IllegalArgumentException( String.format("missing scheme for: <%s - %s>", type, targetType)); } - final HashSet fields = Sets.newHashSet(mapFields(link, contexts)); + final HashSet fields = Sets.newHashSet(mapFields(templateFactory, link, contexts)); if (rel.getValidated() == null) { rel.setValidated(false); } @@ -1212,7 +1272,7 @@ public class XmlRecordFactory implements Serializable { .map(link -> { final String targetType = link.getTarget().getType(); final String name = ModelSupport.getMainType(EntityType.valueOf(targetType)); - final HashSet fields = 
Sets.newHashSet(mapFields(link, null)); + final HashSet fields = Sets.newHashSet(mapFields(templateFactory, link, null)); return templateFactory .getChild(name, link.getTarget().getId(), Lists.newArrayList(fields)); })