subjects cleaning #239

Merged
claudio.atzori merged 15 commits from clean_subjects into beta 2022-09-09 15:17:09 +02:00
29 changed files with 320 additions and 119 deletions

View File

@ -83,4 +83,10 @@ public class Vocabulary implements Serializable {
.orElse(null);
}
public Qualifier lookup(String id) {
return Optional
.ofNullable(getSynonymAsQualifier(id))
.orElse(getTermAsQualifier(id));
}
}

View File

@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
public Optional<Vocabulary> find(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::get);
}
public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name);

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
List<Subject> subjects = Lists
.newArrayList(
r
.getSubject()
.stream()
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
.collect(
Collectors
.toMap(
s -> Optional
.ofNullable(s.getQualifier())
.map(q -> q.getClassid() + s.getValue())
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
}
if (Objects.nonNull(r.getTitle())) {
r
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
String pidProvenance = getProvenance(p.getDataInfo());
if (p
.getQualifier()
.getClassid()
@ -520,6 +525,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return s;
}
protected static Subject cleanValue(Subject s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;

View File

@ -14,6 +14,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -141,7 +142,7 @@ public class OafMapperUtils {
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
@ -189,6 +190,17 @@ public class OafMapperUtils {
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@ -200,6 +212,20 @@ public class OafMapperUtils {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
@ -477,4 +503,15 @@ public class OafMapperUtils {
rel.setProperties(properties);
return rel;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Subject;
public class SubjectProvenanceComparator implements Comparator<Subject> {
@Override
public int compare(Subject left, Subject right) {
String lProv = getProvenance(left.getDataInfo());
String rProv = getProvenance(right.getDataInfo());
if (isBlank(lProv) && isBlank(rProv))
return 0;
if (isBlank(lProv))
return 1;
if (isBlank(rProv))
return -1;
if (lProv.equals(rProv))
return 0;
if (lProv.toLowerCase().contains("crosswalk"))
return -1;
if (rProv.toLowerCase().contains("crosswalk"))
return 1;
if (lProv.toLowerCase().contains("user"))
return -1;
if (rProv.toLowerCase().contains("user"))
return 1;
if (lProv.toLowerCase().contains("propagation"))
return -1;
if (rProv.toLowerCase().contains("propagation"))
return 1;
if (lProv.toLowerCase().contains("iis"))
return -1;
if (rProv.toLowerCase().contains("iis"))
return 1;
return 0;
}
}

View File

@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants {
@ -58,13 +59,13 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static StructuredProperty getSubject(String sbj, String classid, String classname,
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
if (sbj.equals(NULL))
return null;
StructuredProperty sp = new StructuredProperty();
sp.setValue(sbj);
sp
Subject s = new Subject();
s.setValue(sbj);
s
.setQualifier(
OafMapperUtils
.qualifier(
@ -72,7 +73,7 @@ public class Constants {
classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp
s
.setDataInfo(
OafMapperUtils
.dataInfo(
@ -88,7 +89,7 @@ public class Constants {
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return sp;
return s;
}
}

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
@ -79,7 +80,7 @@ public class PrepareFOSSparkJob implements Serializable {
HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
@ -73,7 +74,7 @@ public class PrepareSDGSparkJob implements Serializable {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
SDGDataModel first = it.next();
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it
.forEachRemaining(

View File

@ -19,7 +19,7 @@ import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.io.Source
object DataciteToOAFTransformation {
@ -252,7 +252,7 @@ object DataciteToOAFTransformation {
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
val l: List[Subject] = List()
r.setSubject(l.asJava)
}
}
@ -492,7 +492,7 @@ object DataciteToOAFTransformation {
subjects
.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
s.subject.get,
SUBJ_CLASS,
SUBJ_CLASS,

View File

@ -281,7 +281,7 @@ object BioDBToOAF {
d.setSubject(
subjects
.map(s =>
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
s,
SUBJ_CLASS,
SUBJ_CLASS,

View File

@ -265,8 +265,8 @@ object PubMedToOaf {
result.setLanguage(term)
}
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
OafMapperUtils.structuredProperty(
val subjects: List[Subject] = article.getSubjects.asScala.map(s =>
OafMapperUtils.subject(
s.getValue,
SUBJ_CLASS,
SUBJ_CLASS,

View File

@ -72,7 +72,7 @@ public class ProduceTest {
JavaRDD<Result> tmp = getResultJavaRDD();
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -169,7 +169,7 @@ public class ProduceTest {
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -396,7 +396,7 @@ public class ProduceTest {
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -508,7 +508,7 @@ public class ProduceTest {
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
List<Subject> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
@ -537,7 +537,7 @@ public class ProduceTest {
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
List<StructuredProperty> sbjs_sdg = tmp
List<Subject> sbjs_sdg = tmp
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator())
.filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID))

View File

@ -26,20 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
public class ConversionUtils {
@ -71,6 +58,10 @@ public class ConversionUtils {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static OaBrokerTypedValue oafSubjectToBrokerTypedValue(final Subject sp) {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
if (d == null) {
return null;
@ -115,7 +106,7 @@ public class ConversionUtils {
res.setTitles(structPropList(result.getTitle()));
res.setAbstracts(fieldList(result.getDescription()));
res.setLanguage(classId(result.getLanguage()));
res.setSubjects(structPropTypedList(result.getSubject()));
res.setSubjects(subjectList(result.getSubject()));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
res.setPublisher(fieldValue(result.getPublisher()));
@ -304,6 +295,18 @@ public class ConversionUtils {
.collect(Collectors.toList());
}
private static List<OaBrokerTypedValue> subjectList(final List<Subject> list) {
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
.map(ConversionUtils::oafSubjectToBrokerTypedValue)
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
if (list == null) {
return new ArrayList<>();

View File

@ -391,6 +391,28 @@ object DoiBoostMappingUtil {
di
}
def createSubject(value: String, classId: String, schemeId: String): Subject = {
val s = new Subject
s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
s.setValue(value)
s
}
def createSubject(
value: String,
classId: String,
className: String,
schemeId: String,
schemeName: String
): Subject = {
val s = new Subject
s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
s.setValue(value)
s
}
def createSP(
value: String,
classId: String,

View File

@ -201,7 +201,7 @@ case object Crossref2Oaf {
if (subjectList.nonEmpty) {
result.setSubject(
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
)
}

View File

@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty, Subject}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s
@ -210,8 +210,8 @@ case object ConversionUtil {
val className = "Microsoft Academic Graph classification"
val classid = "MAG"
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP(
val p: List[Subject] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSubject(
s.DisplayName,
classid,
className,
@ -219,10 +219,10 @@ case object ConversionUtil {
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1)
var resList: List[Subject] = List(s1)
if (s.MainType.isDefined) {
val maintp = s.MainType.get
val s2 = createSP(
val s2 = createSubject(
s.MainType.get,
classid,
className,
@ -232,7 +232,7 @@ case object ConversionUtil {
s2.setDataInfo(di)
resList = resList ::: List(s2)
if (maintp.contains(".")) {
val s3 = createSP(
val s3 = createSubject(
maintp.split("\\.").head,
classid,
className,

View File

@ -290,7 +290,7 @@ public class EOSCTagJobTest {
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp
List<Subject> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
.collect()
.get(0)

View File

@ -3,16 +3,15 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
@ -27,9 +26,36 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
return mapping;
}
private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
return;
} else {
// TODO cleaning based on different subject vocabs can be added here
}
}
private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
Subject subject) {
AtomicReference<Boolean> modified = new AtomicReference<>(false);
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
return;
}
Qualifier newValue = vocabulary.lookup(subject.getValue());
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {

I do not understand why you compare subject.value and newValue.classid

I do not understand why you compare subject.value and newValue.classid

I saw by myself. It is ok.

One thing: you will not change the classId of the subject if the value provided is equal to the term in the vocabulary.

I saw by myself. It is ok. One thing: you will not change the classId of the subject if the value provided is equal to the term in the vocabulary.

You are right. However, the vocabulary.lookup method already tryes to find match a synonym and in case it can't then it looks for a matching equivalent term:

public Qualifier lookup(String id) {
	return Optional
		.ofNullable(getSynonymAsQualifier(id))
		.orElse(getTermAsQualifier(id));
}

Then, in case a matching term cannot be found, the method returns a qualifier set to UNKNOWN

public Qualifier getTermAsQualifier(final String termId) {
	if (StringUtils.isBlank(termId)) {
		return OafMapperUtils.unknown(getId(), getName());
	} else if (termExists(termId)) {
		final VocabularyTerm t = getTerm(termId);
		return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
	} else {
		return OafMapperUtils.qualifier(termId, termId, getId(), getName());
	}
}

hence I could just exploit this to decide what to do when the value provided is equal to the term in the vocabulary.

You are right. However, the `vocabulary.lookup` method already tryes to find match a synonym and in case it can't then it looks for a matching equivalent term: ``` public Qualifier lookup(String id) { return Optional .ofNullable(getSynonymAsQualifier(id)) .orElse(getTermAsQualifier(id)); } ``` Then, in case a matching term cannot be found, the method returns a qualifier set to UNKNOWN ``` public Qualifier getTermAsQualifier(final String termId) { if (StringUtils.isBlank(termId)) { return OafMapperUtils.unknown(getId(), getName()); } else if (termExists(termId)) { final VocabularyTerm t = getTerm(termId); return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName()); } else { return OafMapperUtils.qualifier(termId, termId, getId(), getName()); } } ``` hence I could just exploit this to decide what to do when the value provided is equal to the term in the vocabulary.

Yes, I do agree

Yes, I do agree
subject.setValue(newValue.getClassid());
subject.getQualifier().setClassid(vocabularyId);
subject.getQualifier().setClassname(vocabulary.getName());
modified.set(true);
}
});
return modified.get();
}
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType());

View File

@ -8,15 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.util.*;
@ -29,26 +21,7 @@ import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -411,7 +384,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
protected abstract List<Subject> prepareSubjects(Document doc, DataInfo info);
protected abstract Qualifier prepareLanguages(Document doc);
@ -559,6 +532,22 @@ public abstract class AbstractMdRecordToOafMapper {
return res;
}
protected List<Subject> prepareSubjectList(
final Node node,
final String xpath,
final DataInfo info) {
final List<Subject> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res
.add(
subject(
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
n.valueOf("@schemename"), info));
}
return res;
}
protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");

View File

@ -84,8 +84,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
return prepareSubjectList(doc, "//dc:subject", info);
}
@Override

View File

@ -179,7 +179,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
@ -257,8 +257,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//*[local-name()='subject']", info);
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
return prepareSubjectList(doc, "//*[local-name()='subject']", info);
}
@Override

View File

@ -251,6 +251,33 @@ public class GraphCleaningFunctionsTest {
pid.getQualifier().getClassname()));
});
assertNotNull(p_cleaned.getSubject());
List<Subject> fos_subjects = p_cleaned
.getSubject()
.stream()
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size());
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive"
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
// TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned));
}

View File

@ -928,7 +928,8 @@ class MappersTest {
@Test
void testROHub2() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));

View File

@ -743,12 +743,12 @@
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "infrared detectors"
"value": "FOS: Mathematics"
},
{
"dataInfo": {
@ -765,12 +765,12 @@
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "lens antennas"
"value": "FOS: Computer and information sciences"
},
{
"dataInfo": {
@ -787,12 +787,34 @@
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "silicon"
"value": "0101 mathematics"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "subject:fos",
"classname": "subject:fos",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "0101 mathematics"
},
{
"dataInfo": {

View File

@ -1243,4 +1243,6 @@ dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo
dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
dnet:relation_subRelType @=@ relationship @=@ publicationDataset
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences

View File

@ -1118,4 +1118,7 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relat
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences

View File

@ -59,7 +59,7 @@ class ResolveEntitiesTest extends Serializable {
r.setId(id.toLowerCase.trim)
r.setSubject(
List(
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null
@ -250,7 +250,7 @@ class ResolveEntitiesTest extends Serializable {
val r = new Result
r.setSubject(
List(
OafMapperUtils.structuredProperty(
OafMapperUtils.subject(
FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null

View File

@ -85,7 +85,7 @@ public class IndexRecordTransformerTest {
public void testRiunet() throws IOException, TransformerException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
XmlConverterJob.schemaLocation);
final Publication p = load("riunet.json", Publication.class);
@ -95,7 +95,6 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record);
}
@Test
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
@ -129,8 +128,6 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record);
}
@Test
void testDoiUrlNormalization() throws MalformedURLException {

View File

@ -801,7 +801,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.12.2-SNAPSHOT]</dhp-schemas.version>
<dhp-schemas.version>[3.14.0]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>