Merge pull request 'subjects cleaning' (#239) from clean_subjects into beta

Reviewed-on: D-Net/dnet-hadoop#239
This commit is contained in:
Claudio Atzori 2022-09-09 15:17:02 +02:00
commit 5066db3386
29 changed files with 320 additions and 119 deletions

View File

@ -83,4 +83,10 @@ public class Vocabulary implements Serializable {
.orElse(null); .orElse(null);
} }
public Qualifier lookup(String id) {
return Optional
.ofNullable(getSynonymAsQualifier(id))
.orElse(getTermAsQualifier(id));
}
} }

View File

@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
vocs.put(id.toLowerCase(), new Vocabulary(id, name)); vocs.put(id.toLowerCase(), new Vocabulary(id, name));
} }
public Optional<Vocabulary> find(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::get);
}
public void addTerm(final String vocId, final String id, final String name) { public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) { if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name); vocs.get(vocId.toLowerCase()).addTerm(id, name);

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.ZoneId; import java.time.ZoneId;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
} }
if (Objects.nonNull(r.getSubject())) { if (Objects.nonNull(r.getSubject())) {
r List<Subject> subjects = Lists
.setSubject( .newArrayList(
r r
.getSubject() .getSubject()
.stream() .stream()
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(
Collectors
.toMap(
s -> Optional
.ofNullable(s.getQualifier())
.map(q -> q.getClassid() + s.getValue())
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
} }
if (Objects.nonNull(r.getTitle())) { if (Objects.nonNull(r.getTitle())) {
r r
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(p -> StringUtils.isNotBlank(p.getValue())) .filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> { .map(p -> {
// hack to distinguish orcid from orcid_pending // hack to distinguish orcid from orcid_pending
String pidProvenance = Optional String pidProvenance = getProvenance(p.getDataInfo());
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
if (p if (p
.getQualifier() .getQualifier()
.getClassid() .getClassid()
@ -520,6 +525,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return s; return s;
} }
protected static Subject cleanValue(Subject s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) { protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s; return s;

View File

@ -14,6 +14,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator; import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -141,7 +142,7 @@ public class OafMapperUtils {
} }
public static Qualifier unknown(final String schemeid, final String schemename) { public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename); return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
} }
public static AccessRight accessRight( public static AccessRight accessRight(
@ -189,6 +190,17 @@ public class OafMapperUtils {
return q; return q;
} }
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty( public static StructuredProperty structuredProperty(
final String value, final String value,
final String classid, final String classid,
@ -200,6 +212,20 @@ public class OafMapperUtils {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
} }
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty( public static StructuredProperty structuredProperty(
final String value, final String value,
final Qualifier qualifier, final Qualifier qualifier,
@ -477,4 +503,15 @@ public class OafMapperUtils {
rel.setProperties(properties); rel.setProperties(properties);
return rel; return rel;
} }
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
} }

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Subject;
public class SubjectProvenanceComparator implements Comparator<Subject> {
@Override
public int compare(Subject left, Subject right) {
String lProv = getProvenance(left.getDataInfo());
String rProv = getProvenance(right.getDataInfo());
if (isBlank(lProv) && isBlank(rProv))
return 0;
if (isBlank(lProv))
return 1;
if (isBlank(rProv))
return -1;
if (lProv.equals(rProv))
return 0;
if (lProv.toLowerCase().contains("crosswalk"))
return -1;
if (rProv.toLowerCase().contains("crosswalk"))
return 1;
if (lProv.toLowerCase().contains("user"))
return -1;
if (rProv.toLowerCase().contains("user"))
return 1;
if (lProv.toLowerCase().contains("propagation"))
return -1;
if (rProv.toLowerCase().contains("propagation"))
return 1;
if (lProv.toLowerCase().contains("iis"))
return -1;
if (rProv.toLowerCase().contains("iis"))
return 1;
return 0;
}
}

View File

@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants { public class Constants {
@ -58,13 +59,13 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
} }
public static StructuredProperty getSubject(String sbj, String classid, String classname, public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) { String diqualifierclassid) {
if (sbj.equals(NULL)) if (sbj.equals(NULL))
return null; return null;
StructuredProperty sp = new StructuredProperty(); Subject s = new Subject();
sp.setValue(sbj); s.setValue(sbj);
sp s
.setQualifier( .setQualifier(
OafMapperUtils OafMapperUtils
.qualifier( .qualifier(
@ -72,7 +73,7 @@ public class Constants {
classname, classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES)); ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp s
.setDataInfo( .setDataInfo(
OafMapperUtils OafMapperUtils
.dataInfo( .dataInfo(
@ -88,7 +89,7 @@ public class Constants {
ModelConstants.DNET_PROVENANCE_ACTIONS), ModelConstants.DNET_PROVENANCE_ACTIONS),
"")); ""));
return sp; return s;
} }
} }

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
@ -79,7 +80,7 @@ public class PrepareFOSSparkJob implements Serializable {
HashSet<String> level3 = new HashSet<>(); HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first); addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v)); it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
List<StructuredProperty> sbjs = new ArrayList<>(); List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
@ -73,7 +74,7 @@ public class PrepareSDGSparkJob implements Serializable {
Result r = new Result(); Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI)); r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
SDGDataModel first = it.next(); SDGDataModel first = it.next();
List<StructuredProperty> sbjs = new ArrayList<>(); List<Subject> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)); sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it it
.forEachRemaining( .forEachRemaining(

View File

@ -19,7 +19,7 @@ import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import java.util.{Date, Locale} import java.util.{Date, Locale}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.{Codec, Source} import scala.io.Source
object DataciteToOAFTransformation { object DataciteToOAFTransformation {
@ -252,7 +252,7 @@ object DataciteToOAFTransformation {
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue)) .exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) { if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT())) r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List() val l: List[Subject] = List()
r.setSubject(l.asJava) r.setSubject(l.asJava)
} }
} }
@ -492,7 +492,7 @@ object DataciteToOAFTransformation {
subjects subjects
.filter(s => s.subject.nonEmpty) .filter(s => s.subject.nonEmpty)
.map(s => .map(s =>
OafMapperUtils.structuredProperty( OafMapperUtils.subject(
s.subject.get, s.subject.get,
SUBJ_CLASS, SUBJ_CLASS,
SUBJ_CLASS, SUBJ_CLASS,

View File

@ -281,7 +281,7 @@ object BioDBToOAF {
d.setSubject( d.setSubject(
subjects subjects
.map(s => .map(s =>
OafMapperUtils.structuredProperty( OafMapperUtils.subject(
s, s,
SUBJ_CLASS, SUBJ_CLASS,
SUBJ_CLASS, SUBJ_CLASS,

View File

@ -265,8 +265,8 @@ object PubMedToOaf {
result.setLanguage(term) result.setLanguage(term)
} }
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => val subjects: List[Subject] = article.getSubjects.asScala.map(s =>
OafMapperUtils.structuredProperty( OafMapperUtils.subject(
s.getValue, s.getValue,
SUBJ_CLASS, SUBJ_CLASS,
SUBJ_CLASS, SUBJ_CLASS,

View File

@ -72,7 +72,7 @@ public class ProduceTest {
JavaRDD<Result> tmp = getResultJavaRDD(); JavaRDD<Result> tmp = getResultJavaRDD();
List<StructuredProperty> sbjs = tmp List<Subject> sbjs = tmp
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0) .filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
@ -169,7 +169,7 @@ public class ProduceTest {
.getSubject() .getSubject()
.size()); .size());
List<StructuredProperty> sbjs = tmp List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
@ -396,7 +396,7 @@ public class ProduceTest {
.getSubject() .getSubject()
.size()); .size());
List<StructuredProperty> sbjs = tmp List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
@ -508,7 +508,7 @@ public class ProduceTest {
.getSubject() .getSubject()
.size()); .size());
List<StructuredProperty> sbjs = tmp List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
@ -537,7 +537,7 @@ public class ProduceTest {
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG(); JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
List<StructuredProperty> sbjs_sdg = tmp List<Subject> sbjs_sdg = tmp
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0) .filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID)) .filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID))

View File

@ -26,20 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ConversionUtils { public class ConversionUtils {
@ -71,6 +58,10 @@ public class ConversionUtils {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null; return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
} }
public static OaBrokerTypedValue oafSubjectToBrokerTypedValue(final Subject sp) {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
if (d == null) { if (d == null) {
return null; return null;
@ -115,7 +106,7 @@ public class ConversionUtils {
res.setTitles(structPropList(result.getTitle())); res.setTitles(structPropList(result.getTitle()));
res.setAbstracts(fieldList(result.getDescription())); res.setAbstracts(fieldList(result.getDescription()));
res.setLanguage(classId(result.getLanguage())); res.setLanguage(classId(result.getLanguage()));
res.setSubjects(structPropTypedList(result.getSubject())); res.setSubjects(subjectList(result.getSubject()));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor)); res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
res.setPublicationdate(fieldValue(result.getDateofacceptance())); res.setPublicationdate(fieldValue(result.getDateofacceptance()));
res.setPublisher(fieldValue(result.getPublisher())); res.setPublisher(fieldValue(result.getPublisher()));
@ -304,6 +295,18 @@ public class ConversionUtils {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
private static List<OaBrokerTypedValue> subjectList(final List<Subject> list) {
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
.map(ConversionUtils::oafSubjectToBrokerTypedValue)
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) { private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
if (list == null) { if (list == null) {
return new ArrayList<>(); return new ArrayList<>();

View File

@ -391,6 +391,28 @@ object DoiBoostMappingUtil {
di di
} }
def createSubject(value: String, classId: String, schemeId: String): Subject = {
val s = new Subject
s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
s.setValue(value)
s
}
def createSubject(
value: String,
classId: String,
className: String,
schemeId: String,
schemeName: String
): Subject = {
val s = new Subject
s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
s.setValue(value)
s
}
def createSP( def createSP(
value: String, value: String,
classId: String, classId: String,

View File

@ -201,7 +201,7 @@ case object Crossref2Oaf {
if (subjectList.nonEmpty) { if (subjectList.nonEmpty) {
result.setSubject( result.setSubject(
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
) )
} }

View File

@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty, Subject}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s import org.json4s
@ -210,8 +210,8 @@ case object ConversionUtil {
val className = "Microsoft Academic Graph classification" val className = "Microsoft Academic Graph classification"
val classid = "MAG" val classid = "MAG"
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => { val p: List[Subject] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP( val s1 = createSubject(
s.DisplayName, s.DisplayName,
classid, classid,
className, className,
@ -219,10 +219,10 @@ case object ConversionUtil {
ModelConstants.DNET_SUBJECT_TYPOLOGIES ModelConstants.DNET_SUBJECT_TYPOLOGIES
) )
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString) val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1) var resList: List[Subject] = List(s1)
if (s.MainType.isDefined) { if (s.MainType.isDefined) {
val maintp = s.MainType.get val maintp = s.MainType.get
val s2 = createSP( val s2 = createSubject(
s.MainType.get, s.MainType.get,
classid, classid,
className, className,
@ -232,7 +232,7 @@ case object ConversionUtil {
s2.setDataInfo(di) s2.setDataInfo(di)
resList = resList ::: List(s2) resList = resList ::: List(s2)
if (maintp.contains(".")) { if (maintp.contains(".")) {
val s3 = createSP( val s3 = createSubject(
maintp.split("\\.").head, maintp.split("\\.").head,
classid, classid,
className, className,

View File

@ -290,7 +290,7 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp List<Subject> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
.collect() .collect()
.get(0) .get(0)

View File

@ -3,16 +3,15 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable { public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
@ -27,9 +26,36 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o)); mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o)); mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o)); mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
return mapping; return mapping;
} }
private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
return;
} else {
// TODO cleaning based on different subject vocabs can be added here
}
}
private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
Subject subject) {
AtomicReference<Boolean> modified = new AtomicReference<>(false);
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
return;
}
Qualifier newValue = vocabulary.lookup(subject.getValue());
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
subject.setValue(newValue.getClassid());
subject.getQualifier().setClassid(vocabularyId);
subject.getQualifier().setClassname(vocabulary.getName());
modified.set(true);
}
});
return modified.get();
}
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) { if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType()); Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType());

View File

@ -8,15 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.util.*; import java.util.*;
@ -29,26 +21,7 @@ import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -411,7 +384,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info); protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info); protected abstract List<Subject> prepareSubjects(Document doc, DataInfo info);
protected abstract Qualifier prepareLanguages(Document doc); protected abstract Qualifier prepareLanguages(Document doc);
@ -559,6 +532,22 @@ public abstract class AbstractMdRecordToOafMapper {
return res; return res;
} }
protected List<Subject> prepareSubjectList(
final Node node,
final String xpath,
final DataInfo info) {
final List<Subject> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res
.add(
subject(
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
n.valueOf("@schemename"), info));
}
return res;
}
protected OAIProvenance prepareOAIprovenance(final Document doc) { protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");

View File

@ -84,8 +84,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
} }
@Override @Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) { protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info); return prepareSubjectList(doc, "//dc:subject", info);
} }
@Override @Override

View File

@ -179,7 +179,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
} }
for (final Object o : doc for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
} }
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
@ -257,8 +257,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
} }
@Override @Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) { protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//*[local-name()='subject']", info); return prepareSubjectList(doc, "//*[local-name()='subject']", info);
} }
@Override @Override

View File

@ -251,6 +251,33 @@ public class GraphCleaningFunctionsTest {
pid.getQualifier().getClassname())); pid.getQualifier().getClassname()));
}); });
assertNotNull(p_cleaned.getSubject());
List<Subject> fos_subjects = p_cleaned
.getSubject()
.stream()
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size());
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive"
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
// TODO add more assertions to verity the cleaned values // TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned)); System.out.println(MAPPER.writeValueAsString(p_cleaned));
} }

View File

@ -928,7 +928,8 @@ class MappersTest {
@Test @Test
void testROHub2() throws IOException, DocumentException { void testROHub2() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml"))); final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));

View File

@ -743,12 +743,12 @@
"trust": "0.9" "trust": "0.9"
}, },
"qualifier": { "qualifier": {
"classid": "", "classid": "keyword",
"classname": "", "classname": "keyword",
"schemeid": "", "schemeid": "dnet:subject_classification_typologies",
"schemename": "" "schemename": "dnet:subject_classification_typologies"
}, },
"value": "infrared detectors" "value": "FOS: Mathematics"
}, },
{ {
"dataInfo": { "dataInfo": {
@ -765,12 +765,12 @@
"trust": "0.9" "trust": "0.9"
}, },
"qualifier": { "qualifier": {
"classid": "", "classid": "keyword",
"classname": "", "classname": "keyword",
"schemeid": "", "schemeid": "dnet:subject_classification_typologies",
"schemename": "" "schemename": "dnet:subject_classification_typologies"
}, },
"value": "lens antennas" "value": "FOS: Computer and information sciences"
}, },
{ {
"dataInfo": { "dataInfo": {
@ -787,12 +787,34 @@
"trust": "0.9" "trust": "0.9"
}, },
"qualifier": { "qualifier": {
"classid": "", "classid": "keyword",
"classname": "", "classname": "keyword",
"schemeid": "", "schemeid": "dnet:subject_classification_typologies",
"schemename": "" "schemename": "dnet:subject_classification_typologies"
}, },
"value": "silicon" "value": "0101 mathematics"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "subject:fos",
"classname": "subject:fos",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "0101 mathematics"
}, },
{ {
"dataInfo": { "dataInfo": {

View File

@ -1243,4 +1243,6 @@ dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo
dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
dnet:relation_subRelType @=@ relationship @=@ publicationDataset dnet:relation_subRelType @=@ relationship @=@ publicationDataset
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences

View File

@ -1118,4 +1118,7 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relat
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences

View File

@ -59,7 +59,7 @@ class ResolveEntitiesTest extends Serializable {
r.setId(id.toLowerCase.trim) r.setId(id.toLowerCase.trim)
r.setSubject( r.setSubject(
List( List(
OafMapperUtils.structuredProperty( OafMapperUtils.subject(
FAKE_SUBJECT, FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null null
@ -250,7 +250,7 @@ class ResolveEntitiesTest extends Serializable {
val r = new Result val r = new Result
r.setSubject( r.setSubject(
List( List(
OafMapperUtils.structuredProperty( OafMapperUtils.subject(
FAKE_SUBJECT, FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null null

View File

@ -85,7 +85,7 @@ public class IndexRecordTransformerTest {
public void testRiunet() throws IOException, TransformerException { public void testRiunet() throws IOException, TransformerException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); XmlConverterJob.schemaLocation);
final Publication p = load("riunet.json", Publication.class); final Publication p = load("riunet.json", Publication.class);
@ -95,7 +95,6 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record); testRecordTransformation(record);
} }
@Test @Test
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException { public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml")); final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
@ -129,8 +128,6 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record); testRecordTransformation(record);
} }
@Test @Test
void testDoiUrlNormalization() throws MalformedURLException { void testDoiUrlNormalization() throws MalformedURLException {

View File

@ -801,7 +801,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.12.2-SNAPSHOT]</dhp-schemas.version> <dhp-schemas.version>[3.14.0]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>