WIP: cleaning of subjects

This commit is contained in:
Claudio Atzori 2022-08-04 11:39:39 +02:00
parent efd96e7e66
commit 27a91841e7
21 changed files with 132 additions and 85 deletions

View File

@ -520,6 +520,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return s;
}
protected static Subject cleanValue(Subject s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;

View File

@ -189,6 +189,17 @@ public class OafMapperUtils {
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@ -200,6 +211,20 @@ public class OafMapperUtils {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,

View File

@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants {
@ -58,13 +59,13 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static StructuredProperty getSubject(String sbj, String classid, String classname,
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
if (sbj.equals(NULL))
return null;
StructuredProperty sp = new StructuredProperty();
sp.setValue(sbj);
sp
Subject s = new Subject();
s.setValue(sbj);
s
.setQualifier(
OafMapperUtils
.qualifier(
@ -72,7 +73,7 @@ public class Constants {
classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp
s
.setDataInfo(
OafMapperUtils
.dataInfo(
@ -88,7 +89,7 @@ public class Constants {
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return sp;
return s;
}
}

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
@ -79,7 +80,7 @@ public class PrepareFOSSparkJob implements Serializable {
HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
@ -73,7 +74,7 @@ public class PrepareSDGSparkJob implements Serializable {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
SDGDataModel first = it.next();
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it
.forEachRemaining(

View File

@ -19,7 +19,7 @@ import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.io.Source
object DataciteToOAFTransformation {
@ -252,7 +252,7 @@ object DataciteToOAFTransformation {
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
val l: List[Subject] = List()
r.setSubject(l.asJava)
}
}
@ -492,7 +492,7 @@ object DataciteToOAFTransformation {
subjects
.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
s.subject.get,
SUBJ_CLASS,
SUBJ_CLASS,

View File

@ -281,7 +281,7 @@ object BioDBToOAF {
d.setSubject(
subjects
.map(s =>
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
s,
SUBJ_CLASS,
SUBJ_CLASS,

View File

@ -265,8 +265,8 @@ object PubMedToOaf {
result.setLanguage(term)
}
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
OafMapperUtils.structuredProperty(
val subjects: List[Subject] = article.getSubjects.asScala.map(s =>
OafMapperUtils.subject(
s.getValue,
SUBJ_CLASS,
SUBJ_CLASS,

View File

@ -72,7 +72,7 @@ public class ProduceTest {
JavaRDD<Result> tmp = getResultJavaRDD();
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -169,7 +169,7 @@ public class ProduceTest {
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -396,7 +396,7 @@ public class ProduceTest {
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -508,7 +508,7 @@ public class ProduceTest {
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -537,7 +537,7 @@ public class ProduceTest {
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
List<StructuredProperty> sbjs_sdg = tmp
List<Subject> sbjs_sdg = tmp
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator())
.filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID))

View File

@ -26,20 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
public class ConversionUtils {
@ -71,6 +58,10 @@ public class ConversionUtils {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static OaBrokerTypedValue oafSubjectToBrokerTypedValue(final Subject sp) {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
if (d == null) {
return null;
@ -115,7 +106,7 @@ public class ConversionUtils {
res.setTitles(structPropList(result.getTitle()));
res.setAbstracts(fieldList(result.getDescription()));
res.setLanguage(classId(result.getLanguage()));
res.setSubjects(structPropTypedList(result.getSubject()));
res.setSubjects(subjectList(result.getSubject()));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
res.setPublisher(fieldValue(result.getPublisher()));
@ -304,6 +295,18 @@ public class ConversionUtils {
.collect(Collectors.toList());
}
private static List<OaBrokerTypedValue> subjectList(final List<Subject> list) {
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
.map(ConversionUtils::oafSubjectToBrokerTypedValue)
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
if (list == null) {
return new ArrayList<>();

View File

@ -391,6 +391,28 @@ object DoiBoostMappingUtil {
di
}
def createSubject(value: String, classId: String, schemeId: String): Subject = {
val s = new Subject
s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
s.setValue(value)
s
}
def createSubject(
value: String,
classId: String,
className: String,
schemeId: String,
schemeName: String
): Subject = {
val s = new Subject
s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
s.setValue(value)
s
}
def createSP(
value: String,
classId: String,

View File

@ -201,7 +201,7 @@ case object Crossref2Oaf {
if (subjectList.nonEmpty) {
result.setSubject(
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
)
}

View File

@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty, Subject}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s
@ -210,8 +210,8 @@ case object ConversionUtil {
val className = "Microsoft Academic Graph classification"
val classid = "MAG"
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP(
val p: List[Subject] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSubject(
s.DisplayName,
classid,
className,
@ -219,10 +219,10 @@ case object ConversionUtil {
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1)
var resList: List[Subject] = List(s1)
if (s.MainType.isDefined) {
val maintp = s.MainType.get
val s2 = createSP(
val s2 = createSubject(
s.MainType.get,
classid,
className,
@ -232,7 +232,7 @@ case object ConversionUtil {
s2.setDataInfo(di)
resList = resList ::: List(s2)
if (maintp.contains(".")) {
val s3 = createSP(
val s3 = createSubject(
maintp.split("\\.").head,
classid,
className,

View File

@ -290,7 +290,7 @@ public class EOSCTagJobTest {
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp
List<Subject> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
.collect()
.get(0)

View File

@ -27,6 +27,7 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
return mapping;
}

View File

@ -17,6 +17,7 @@ import eu.dnetlib.dhp.schema.dump.oaf.Instance;
import eu.dnetlib.dhp.schema.dump.oaf.Measure;
import eu.dnetlib.dhp.schema.dump.oaf.OpenAccessRoute;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.dump.oaf.Subject;
import eu.dnetlib.dhp.schema.dump.oaf.community.CfHbKeyValue;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
@ -66,7 +67,7 @@ public class ResultMapper implements Serializable {
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
.ifPresent(value -> value.forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
Optional
@ -103,7 +104,7 @@ public class ResultMapper implements Serializable {
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
.ifPresent(value -> value.forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
out.setDateofcollection(input.getDateofcollection());
@ -114,14 +115,12 @@ public class ResultMapper implements Serializable {
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
oStr.ifPresent(stringField -> out.setEmbargoenddate(stringField.getValue()));
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
.ifPresent(value -> value.forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
out.setId(input.getId());
out.setOriginalId(new ArrayList<>());

View File

@ -8,15 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.util.*;
@ -29,26 +21,7 @@ import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -411,7 +384,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
protected abstract List<Subject> prepareSubjects(Document doc, DataInfo info);
protected abstract Qualifier prepareLanguages(Document doc);
@ -559,6 +532,22 @@ public abstract class AbstractMdRecordToOafMapper {
return res;
}
protected List<Subject> prepareSubjectList(
final Node node,
final String xpath,
final DataInfo info) {
final List<Subject> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res
.add(
subject(
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
n.valueOf("@schemename"), info));
}
return res;
}
protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");

View File

@ -84,8 +84,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
return prepareSubjectList(doc, "//dc:subject", info);
}
@Override

View File

@ -249,8 +249,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//*[local-name()='subject']", info);
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
return prepareSubjectList(doc, "//*[local-name()='subject']", info);
}
@Override

View File

@ -59,7 +59,7 @@ class ResolveEntitiesTest extends Serializable {
r.setId(id.toLowerCase.trim)
r.setSubject(
List(
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null
@ -250,7 +250,7 @@ class ResolveEntitiesTest extends Serializable {
val r = new Result
r.setSubject(
List(
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null

View File

@ -801,7 +801,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.12.2-SNAPSHOT]</dhp-schemas.version>
<dhp-schemas.version>[2.13.2-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>