forked from D-Net/dnet-hadoop
Implemented changes from #9497: sort abstracts by string length, include author full names in the related results, and expand instance details within each children/result XML element
This commit is contained in:
parent
7863c92466
commit
af154d4456
|
@ -506,6 +506,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.sorted((s1, s2) -> s2.getValue().length() - s1.getValue().length())
|
||||
.limit(ModelHardLimits.MAX_ABSTRACTS)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
|
|
|
@ -10,6 +10,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -27,14 +28,7 @@ import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
|||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
@ -156,16 +150,33 @@ public class CreateRelatedEntitiesJob_phase1 {
|
|||
case software:
|
||||
final Result result = (Result) entity;
|
||||
|
||||
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
|
||||
final StructuredProperty title = result.getTitle().stream().findFirst().get();
|
||||
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||
re.setTitle(title);
|
||||
// Copy the first available title of the result onto the related entity, truncated to
// ModelHardLimits.MAX_TITLE_LENGTH characters.
if (Objects.nonNull(result.getTitle()) && !result.getTitle().isEmpty()) {
	result
		.getTitle()
		.stream()
		// findFirst() throws a NullPointerException when the selected element is null
		.filter(Objects::nonNull)
		.findFirst()
		.ifPresent(title -> {
			// The title must be attached with re.setTitle(...): dereferencing re.getTitle()
			// fails with a NullPointerException when no title has been set on re yet.
			// NOTE(review): the StructuredProperty taken from the result is truncated in place,
			// as the previous implementation of this branch did.
			title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
			re.setTitle(title);
		});
}
|
||||
if (result.getDescription() != null && !result.getDescription().isEmpty()) {
|
||||
final Field<String> description = result.getDescription().stream().findFirst().get();
|
||||
if (StringUtils.isNotBlank(description.getValue())) {
|
||||
re.setDescription(StringUtils.left(description.getValue(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH));
|
||||
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
||||
result
|
||||
.getDescription()
|
||||
.stream()
|
||||
.findFirst()
|
||||
.map(Field::getValue)
|
||||
.ifPresent(
|
||||
d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)));
|
||||
}
|
||||
if (Objects.nonNull(result.getAuthor()) && !result.getAuthor().isEmpty()) {
|
||||
re
|
||||
.setAuthor(
|
||||
result
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(Author::getFullname)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
re.setDateofacceptance(getValue(result.getDateofacceptance()));
|
||||
|
|
|
@ -63,32 +63,32 @@ public class ProvisionModelSupport {
|
|||
}
|
||||
|
||||
public static SolrRecord transform(JoinedEntity je, ContextMapper contextMapper, VocabularyGroup vocs) {
|
||||
SolrRecord record = new SolrRecord();
|
||||
SolrRecord r = new SolrRecord();
|
||||
final OafEntity e = je.getEntity();
|
||||
final RecordType type = RecordType.valueOf(e.getClass().getSimpleName().toLowerCase());
|
||||
final Boolean deletedbyinference = Optional
|
||||
.ofNullable(e.getDataInfo())
|
||||
.map(DataInfo::getDeletedbyinference)
|
||||
.orElse(null);
|
||||
record
|
||||
r
|
||||
.setHeader(
|
||||
SolrRecordHeader
|
||||
.newInstance(
|
||||
e.getId(), e.getOriginalId(), type, deletedbyinference));
|
||||
record.setCollectedfrom(asProvenance(e.getCollectedfrom()));
|
||||
record.setContext(asContext(e.getContext(), contextMapper));
|
||||
record.setPid(asPid(e.getPid()));
|
||||
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
|
||||
r.setContext(asContext(e.getContext(), contextMapper));
|
||||
r.setPid(asPid(e.getPid()));
|
||||
|
||||
if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
|
||||
record.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
|
||||
r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
|
||||
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Datasource) {
|
||||
record.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e));
|
||||
r.setDatasource(mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) e));
|
||||
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Organization) {
|
||||
record.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
|
||||
r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
|
||||
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
|
||||
record.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
|
||||
r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
|
||||
}
|
||||
record
|
||||
r
|
||||
.setLinks(
|
||||
Optional
|
||||
.ofNullable(je.getLinks())
|
||||
|
@ -99,7 +99,7 @@ public class ProvisionModelSupport {
|
|||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
return record;
|
||||
return r;
|
||||
}
|
||||
|
||||
private static RelatedRecord mapRelatedRecord(RelatedEntityWrapper rew, VocabularyGroup vocs) {
|
||||
|
|
|
@ -24,6 +24,7 @@ public class RelatedEntity implements Serializable {
|
|||
|
||||
// results
|
||||
private String description;
|
||||
private List<String> author;
|
||||
private String dateofacceptance;
|
||||
private String publisher;
|
||||
private List<StructuredProperty> pid;
|
||||
|
@ -86,6 +87,14 @@ public class RelatedEntity implements Serializable {
|
|||
this.description = description;
|
||||
}
|
||||
|
||||
public List<String> getAuthor() {
|
||||
return author;
|
||||
}
|
||||
|
||||
public void setAuthor(List<String> author) {
|
||||
this.author = author;
|
||||
}
|
||||
|
||||
public void setWebsiteurl(String websiteurl) {
|
||||
this.websiteurl = websiteurl;
|
||||
}
|
||||
|
|
|
@ -100,13 +100,17 @@ public class TemplateFactory {
|
|||
|
||||
public String getInstance(
|
||||
final List<String> instancemetadata, final String url) {
|
||||
return getInstance(instancemetadata, Lists.newArrayList(url));
|
||||
}
|
||||
|
||||
public String getInstance(
|
||||
final List<String> instancemetadata, final List<String> url) {
|
||||
return getTemplate(resources.getInstance())
|
||||
.add("metadata", instancemetadata)
|
||||
.add(
|
||||
"webresources",
|
||||
Optional
|
||||
.ofNullable(url)
|
||||
.map(u -> Lists.newArrayList(url))
|
||||
.orElse(Lists.newArrayList())
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -49,6 +49,7 @@ import eu.dnetlib.dhp.schema.common.*;
|
|||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class XmlRecordFactory implements Serializable {
|
||||
|
@ -365,6 +366,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
.getDescription()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.limit(ModelHardLimits.MAX_ABSTRACTS)
|
||||
.map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue()))
|
||||
.collect(Collectors.toCollection(HashSet::new)));
|
||||
}
|
||||
|
@ -1057,7 +1059,8 @@ public class XmlRecordFactory implements Serializable {
|
|||
return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue());
|
||||
}
|
||||
|
||||
private List<String> mapFields(final RelatedEntityWrapper link, final Set<String> contexts) {
|
||||
private List<String> mapFields(final TemplateFactory templateFactory, final RelatedEntityWrapper link,
|
||||
final Set<String> contexts) {
|
||||
final Relation rel = link.getRelation();
|
||||
final RelatedEntity re = link.getTarget();
|
||||
final String targetType = link.getTarget().getType();
|
||||
|
@ -1074,6 +1077,15 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (StringUtils.isNotBlank(re.getDescription())) {
|
||||
metadata.add(XmlSerializationUtils.asXmlElement("description", re.getDescription()));
|
||||
}
|
||||
if (re.getAuthor() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
re
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(author -> XmlSerializationUtils.asXmlElement("creator", author))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (isNotBlank(re.getDateofacceptance())) {
|
||||
metadata
|
||||
.add(XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance()));
|
||||
|
@ -1107,6 +1119,54 @@ public class XmlRecordFactory implements Serializable {
|
|||
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (re.getInstances() != null) {
|
||||
re
|
||||
.getInstances()
|
||||
.forEach(i -> {
|
||||
final List<String> instanceFields = Lists.newArrayList();
|
||||
if (i.getAccessright() != null && !i.getAccessright().isBlank()) {
|
||||
instanceFields
|
||||
.add(XmlSerializationUtils.mapQualifier("accessright", i.getAccessright()));
|
||||
}
|
||||
if (i.getHostedby() != null) {
|
||||
instanceFields.add(XmlSerializationUtils.mapKeyValue("hostedby", i.getHostedby()));
|
||||
}
|
||||
if (i.getDateofacceptance() != null && isNotBlank(i.getDateofacceptance().getValue())) {
|
||||
instanceFields
|
||||
.add(
|
||||
XmlSerializationUtils
|
||||
.asXmlElement("dateofacceptance", i.getDateofacceptance().getValue()));
|
||||
}
|
||||
if (i.getInstancetype() != null && !i.getInstancetype().isBlank()) {
|
||||
instanceFields
|
||||
.add(XmlSerializationUtils.mapQualifier("instancetype", i.getInstancetype()));
|
||||
}
|
||||
|
||||
if (i.getRefereed() != null && !i.getRefereed().isBlank()) {
|
||||
instanceFields.add(XmlSerializationUtils.mapQualifier("refereed", i.getRefereed()));
|
||||
}
|
||||
|
||||
if (i.getLicense() != null && isNotBlank(i.getLicense().getValue())) {
|
||||
instanceFields
|
||||
.add(XmlSerializationUtils.asXmlElement("license", i.getLicense().getValue()));
|
||||
}
|
||||
if (isNotBlank(i.getFulltext())) {
|
||||
instanceFields.add(XmlSerializationUtils.asXmlElement("fulltext", i.getFulltext()));
|
||||
}
|
||||
if (i.getUrl() != null && !i.getUrl().isEmpty()) {
|
||||
instanceFields
|
||||
.addAll(
|
||||
i
|
||||
.getUrl()
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(url -> XmlSerializationUtils.asXmlElement("url", url))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
metadata.add(templateFactory.getInstance(instanceFields, i.getUrl()));
|
||||
});
|
||||
}
|
||||
|
||||
break;
|
||||
case datasource:
|
||||
if (isNotBlank(re.getOfficialname())) {
|
||||
|
@ -1188,7 +1248,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
throw new IllegalArgumentException(
|
||||
String.format("missing scheme for: <%s - %s>", type, targetType));
|
||||
}
|
||||
final HashSet<String> fields = Sets.newHashSet(mapFields(link, contexts));
|
||||
final HashSet<String> fields = Sets.newHashSet(mapFields(templateFactory, link, contexts));
|
||||
if (rel.getValidated() == null) {
|
||||
rel.setValidated(false);
|
||||
}
|
||||
|
@ -1212,7 +1272,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
.map(link -> {
|
||||
final String targetType = link.getTarget().getType();
|
||||
final String name = ModelSupport.getMainType(EntityType.valueOf(targetType));
|
||||
final HashSet<String> fields = Sets.newHashSet(mapFields(link, null));
|
||||
final HashSet<String> fields = Sets.newHashSet(mapFields(templateFactory, link, null));
|
||||
return templateFactory
|
||||
.getChild(name, link.getTarget().getId(), Lists.newArrayList(fields));
|
||||
})
|
||||
|
|
Loading…
Reference in New Issue