subjects cleaning #239
|
@ -83,4 +83,10 @@ public class Vocabulary implements Serializable {
|
|||
.orElse(null);
|
||||
}
|
||||
|
||||
public Qualifier lookup(String id) {
|
||||
return Optional
|
||||
.ofNullable(getSynonymAsQualifier(id))
|
||||
.orElse(getTermAsQualifier(id));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
|
|||
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
|
||||
}
|
||||
|
||||
public Optional<Vocabulary> find(final String vocId) {
|
||||
return Optional
|
||||
.ofNullable(vocId)
|
||||
.map(String::toLowerCase)
|
||||
.map(vocs::get);
|
||||
}
|
||||
|
||||
public void addTerm(final String vocId, final String id, final String name) {
|
||||
if (vocabularyExists(vocId)) {
|
||||
vocs.get(vocId.toLowerCase()).addTerm(id, name);
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
List<Subject> subjects = Lists
|
||||
.newArrayList(
|
||||
r
|
||||
.getSubject()
|
||||
.stream()
|
||||
|
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
s -> Optional
|
||||
.ofNullable(s.getQualifier())
|
||||
.map(q -> q.getClassid() + s.getValue())
|
||||
.orElse(s.getValue()),
|
||||
Function.identity(),
|
||||
(s1, s2) -> Collections
|
||||
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
|
||||
.values());
|
||||
r.setSubject(subjects);
|
||||
}
|
||||
if (Objects.nonNull(r.getTitle())) {
|
||||
r
|
||||
|
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
// hack to distinguish orcid from orcid_pending
|
||||
String pidProvenance = Optional
|
||||
.ofNullable(p.getDataInfo())
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
String pidProvenance = getProvenance(p.getDataInfo());
|
||||
if (p
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
|
@ -520,6 +525,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return s;
|
||||
}
|
||||
|
||||
protected static Subject cleanValue(Subject s) {
|
||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||
return s;
|
||||
}
|
||||
|
||||
protected static Field<String> cleanValue(Field<String> s) {
|
||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||
return s;
|
||||
|
|
|
@ -14,6 +14,7 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
@ -141,7 +142,7 @@ public class OafMapperUtils {
|
|||
}
|
||||
|
||||
public static Qualifier unknown(final String schemeid, final String schemename) {
|
||||
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
|
||||
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
|
||||
}
|
||||
|
||||
public static AccessRight accessRight(
|
||||
|
@ -189,6 +190,17 @@ public class OafMapperUtils {
|
|||
return q;
|
||||
}
|
||||
|
||||
public static Subject subject(
|
||||
final String value,
|
||||
final String classid,
|
||||
final String classname,
|
||||
final String schemeid,
|
||||
final String schemename,
|
||||
final DataInfo dataInfo) {
|
||||
|
||||
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(
|
||||
final String value,
|
||||
final String classid,
|
||||
|
@ -200,6 +212,20 @@ public class OafMapperUtils {
|
|||
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
||||
}
|
||||
|
||||
public static Subject subject(
|
||||
final String value,
|
||||
final Qualifier qualifier,
|
||||
final DataInfo dataInfo) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
final Subject s = new Subject();
|
||||
s.setValue(value);
|
||||
s.setQualifier(qualifier);
|
||||
s.setDataInfo(dataInfo);
|
||||
return s;
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(
|
||||
final String value,
|
||||
final Qualifier qualifier,
|
||||
|
@ -477,4 +503,15 @@ public class OafMapperUtils {
|
|||
rel.setProperties(properties);
|
||||
return rel;
|
||||
}
|
||||
|
||||
public static String getProvenance(DataInfo dataInfo) {
|
||||
return Optional
|
||||
.ofNullable(dataInfo)
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
|
||||
public class SubjectProvenanceComparator implements Comparator<Subject> {
|
||||
|
||||
@Override
|
||||
public int compare(Subject left, Subject right) {
|
||||
|
||||
String lProv = getProvenance(left.getDataInfo());
|
||||
String rProv = getProvenance(right.getDataInfo());
|
||||
|
||||
if (isBlank(lProv) && isBlank(rProv))
|
||||
return 0;
|
||||
if (isBlank(lProv))
|
||||
return 1;
|
||||
if (isBlank(rProv))
|
||||
return -1;
|
||||
if (lProv.equals(rProv))
|
||||
return 0;
|
||||
if (lProv.toLowerCase().contains("crosswalk"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("crosswalk"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("user"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("user"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("propagation"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("propagation"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("iis"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("iis"))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class Constants {
|
||||
|
@ -58,13 +59,13 @@ public class Constants {
|
|||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static StructuredProperty getSubject(String sbj, String classid, String classname,
|
||||
public static Subject getSubject(String sbj, String classid, String classname,
|
||||
String diqualifierclassid) {
|
||||
if (sbj.equals(NULL))
|
||||
return null;
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(sbj);
|
||||
sp
|
||||
Subject s = new Subject();
|
||||
s.setValue(sbj);
|
||||
s
|
||||
.setQualifier(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
|
@ -72,7 +73,7 @@ public class Constants {
|
|||
classname,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
||||
sp
|
||||
s
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
|
@ -88,7 +89,7 @@ public class Constants {
|
|||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
|
||||
return sp;
|
||||
return s;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
|
@ -79,7 +80,7 @@ public class PrepareFOSSparkJob implements Serializable {
|
|||
HashSet<String> level3 = new HashSet<>();
|
||||
addLevels(level1, level2, level3, first);
|
||||
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
List<Subject> sbjs = new ArrayList<>();
|
||||
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
|
|
|
@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
|
@ -73,7 +74,7 @@ public class PrepareSDGSparkJob implements Serializable {
|
|||
Result r = new Result();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
|
||||
SDGDataModel first = it.next();
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
List<Subject> sbjs = new ArrayList<>();
|
||||
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
|
||||
it
|
||||
.forEachRemaining(
|
||||
|
|
|
@ -19,7 +19,7 @@ import java.time.chrono.ThaiBuddhistDate
|
|||
import java.time.format.DateTimeFormatter
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
import scala.io.Source
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
|
@ -252,7 +252,7 @@ object DataciteToOAFTransformation {
|
|||
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
val l: List[Subject] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
@ -492,7 +492,7 @@ object DataciteToOAFTransformation {
|
|||
subjects
|
||||
.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
s.subject.get,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
|
|
|
@ -281,7 +281,7 @@ object BioDBToOAF {
|
|||
d.setSubject(
|
||||
subjects
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
s,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
|
|
|
@ -265,8 +265,8 @@ object PubMedToOaf {
|
|||
result.setLanguage(term)
|
||||
}
|
||||
|
||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
val subjects: List[Subject] = article.getSubjects.asScala.map(s =>
|
||||
OafMapperUtils.subject(
|
||||
s.getValue,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
|
|
|
@ -72,7 +72,7 @@ public class ProduceTest {
|
|||
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -169,7 +169,7 @@ public class ProduceTest {
|
|||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -396,7 +396,7 @@ public class ProduceTest {
|
|||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -508,7 +508,7 @@ public class ProduceTest {
|
|||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -537,7 +537,7 @@ public class ProduceTest {
|
|||
|
||||
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
|
||||
|
||||
List<StructuredProperty> sbjs_sdg = tmp
|
||||
List<Subject> sbjs_sdg = tmp
|
||||
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID))
|
||||
|
|
|
@ -26,20 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
|||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class ConversionUtils {
|
||||
|
||||
|
@ -71,6 +58,10 @@ public class ConversionUtils {
|
|||
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
|
||||
}
|
||||
|
||||
public static OaBrokerTypedValue oafSubjectToBrokerTypedValue(final Subject sp) {
|
||||
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
|
||||
}
|
||||
|
||||
public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||
if (d == null) {
|
||||
return null;
|
||||
|
@ -115,7 +106,7 @@ public class ConversionUtils {
|
|||
res.setTitles(structPropList(result.getTitle()));
|
||||
res.setAbstracts(fieldList(result.getDescription()));
|
||||
res.setLanguage(classId(result.getLanguage()));
|
||||
res.setSubjects(structPropTypedList(result.getSubject()));
|
||||
res.setSubjects(subjectList(result.getSubject()));
|
||||
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
|
||||
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
|
||||
res.setPublisher(fieldValue(result.getPublisher()));
|
||||
|
@ -304,6 +295,18 @@ public class ConversionUtils {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static List<OaBrokerTypedValue> subjectList(final List<Subject> list) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
.map(ConversionUtils::oafSubjectToBrokerTypedValue)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
|
|
|
@ -391,6 +391,28 @@ object DoiBoostMappingUtil {
|
|||
di
|
||||
}
|
||||
|
||||
def createSubject(value: String, classId: String, schemeId: String): Subject = {
|
||||
val s = new Subject
|
||||
s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
|
||||
s.setValue(value)
|
||||
s
|
||||
|
||||
}
|
||||
|
||||
def createSubject(
|
||||
value: String,
|
||||
classId: String,
|
||||
className: String,
|
||||
schemeId: String,
|
||||
schemeName: String
|
||||
): Subject = {
|
||||
val s = new Subject
|
||||
s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
||||
s.setValue(value)
|
||||
s
|
||||
|
||||
}
|
||||
|
||||
def createSP(
|
||||
value: String,
|
||||
classId: String,
|
||||
|
|
|
@ -201,7 +201,7 @@ case object Crossref2Oaf {
|
|||
|
||||
if (subjectList.nonEmpty) {
|
||||
result.setSubject(
|
||||
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
|
||||
subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.mag
|
|||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty, Subject}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||
import org.json4s
|
||||
|
@ -210,8 +210,8 @@ case object ConversionUtil {
|
|||
val className = "Microsoft Academic Graph classification"
|
||||
val classid = "MAG"
|
||||
|
||||
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
|
||||
val s1 = createSP(
|
||||
val p: List[Subject] = fieldOfStudy.subjects.flatMap(s => {
|
||||
val s1 = createSubject(
|
||||
s.DisplayName,
|
||||
classid,
|
||||
className,
|
||||
|
@ -219,10 +219,10 @@ case object ConversionUtil {
|
|||
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||
)
|
||||
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
|
||||
var resList: List[StructuredProperty] = List(s1)
|
||||
var resList: List[Subject] = List(s1)
|
||||
if (s.MainType.isDefined) {
|
||||
val maintp = s.MainType.get
|
||||
val s2 = createSP(
|
||||
val s2 = createSubject(
|
||||
s.MainType.get,
|
||||
classid,
|
||||
className,
|
||||
|
@ -232,7 +232,7 @@ case object ConversionUtil {
|
|||
s2.setDataInfo(di)
|
||||
resList = resList ::: List(s2)
|
||||
if (maintp.contains(".")) {
|
||||
val s3 = createSP(
|
||||
val s3 = createSubject(
|
||||
maintp.split("\\.").head,
|
||||
classid,
|
||||
className,
|
||||
|
|
|
@ -290,7 +290,7 @@ public class EOSCTagJobTest {
|
|||
.stream()
|
||||
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
List<StructuredProperty> subjects = tmp
|
||||
List<Subject> subjects = tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
|
||||
.collect()
|
||||
.get(0)
|
||||
|
|
|
@ -3,16 +3,15 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
|||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
|
||||
|
||||
|
@ -27,9 +26,36 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
|
||||
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
|
||||
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
|
||||
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
|
||||
return mapping;
|
||||
}
|
||||
|
||||
private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
|
||||
if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
|
||||
return;
|
||||
} else {
|
||||
// TODO cleaning based on different subject vocabs can be added here
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
||||
Subject subject) {
|
||||
AtomicReference<Boolean> modified = new AtomicReference<>(false);
|
||||
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
|
||||
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
|
||||
return;
|
||||
}
|
||||
Qualifier newValue = vocabulary.lookup(subject.getValue());
|
||||
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
|
||||
|
||||
subject.setValue(newValue.getClassid());
|
||||
subject.getQualifier().setClassid(vocabularyId);
|
||||
subject.getQualifier().setClassname(vocabulary.getName());
|
||||
modified.set(true);
|
||||
}
|
||||
});
|
||||
return modified.get();
|
||||
}
|
||||
|
||||
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
|
||||
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
|
||||
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType());
|
||||
|
|
|
@ -8,15 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
@ -29,26 +21,7 @@ import com.google.common.collect.Sets;
|
|||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
|
@ -411,7 +384,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
|
||||
protected abstract List<Subject> prepareSubjects(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Qualifier prepareLanguages(Document doc);
|
||||
|
||||
|
@ -559,6 +532,22 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return res;
|
||||
}
|
||||
|
||||
protected List<Subject> prepareSubjectList(
|
||||
final Node node,
|
||||
final String xpath,
|
||||
final DataInfo info) {
|
||||
final List<Subject> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res
|
||||
.add(
|
||||
subject(
|
||||
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
|
||||
n.valueOf("@schemename"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected OAIProvenance prepareOAIprovenance(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
|
||||
|
||||
|
|
|
@ -84,8 +84,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareSubjectList(doc, "//dc:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -257,8 +257,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//*[local-name()='subject']", info);
|
||||
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareSubjectList(doc, "//*[local-name()='subject']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -251,6 +251,33 @@ public class GraphCleaningFunctionsTest {
|
|||
pid.getQualifier().getClassname()));
|
||||
});
|
||||
|
||||
assertNotNull(p_cleaned.getSubject());
|
||||
|
||||
List<Subject> fos_subjects = p_cleaned
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertNotNull(fos_subjects);
|
||||
assertEquals(2, fos_subjects.size());
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0101 mathematics".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||
"sysimport:crosswalk:datasetarchive"
|
||||
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||
|
||||
// TODO add more assertions to verity the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_cleaned));
|
||||
}
|
||||
|
|
|
@ -928,7 +928,8 @@ class MappersTest {
|
|||
|
||||
@Test
|
||||
void testROHub2() throws IOException, DocumentException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
|
|
@ -743,12 +743,12 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "infrared detectors"
|
||||
"value": "FOS: Mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
@ -765,12 +765,12 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "lens antennas"
|
||||
"value": "FOS: Computer and information sciences"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
@ -787,12 +787,34 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "silicon"
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "subject:fos",
|
||||
"classname": "subject:fos",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
|
|
@ -1244,3 +1244,5 @@ dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
|
|||
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
|
||||
dnet:relation_subRelType @=@ relationship @=@ publicationDataset
|
||||
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
|
||||
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
|
||||
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences
|
|
@ -1119,3 +1119,6 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
|
|||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences
|
|
@ -59,7 +59,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
r.setId(id.toLowerCase.trim)
|
||||
r.setSubject(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
FAKE_SUBJECT,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
|
@ -250,7 +250,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
val r = new Result
|
||||
r.setSubject(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
FAKE_SUBJECT,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
|
|
|
@ -95,7 +95,6 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
|
||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
|
||||
|
@ -129,8 +128,6 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
void testDoiUrlNormalization() throws MalformedURLException {
|
||||
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -801,7 +801,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[2.12.2-SNAPSHOT]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[3.14.0]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue
I do not understand why you compare subject.value and newValue.classid
I saw by myself. It is ok.
One thing: you will not change the classId of the subject if the value provided is equal to the term in the vocabulary.
You are right. However, the
vocabulary.lookup
method already tryes to find match a synonym and in case it can't then it looks for a matching equivalent term:Then, in case a matching term cannot be found, the method returns a qualifier set to UNKNOWN
hence I could just exploit this to decide what to do when the value provided is equal to the term in the vocabulary.
Yes, I do agree