forked from D-Net/dnet-hadoop
Merge branch 'beta' into clean_country
This commit is contained in:
commit
e45ec15221
|
@ -83,4 +83,10 @@ public class Vocabulary implements Serializable {
|
|||
.orElse(null);
|
||||
}
|
||||
|
||||
public Qualifier lookup(String id) {
|
||||
return Optional
|
||||
.ofNullable(getSynonymAsQualifier(id))
|
||||
.orElse(getTermAsQualifier(id));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
|
|||
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
|
||||
}
|
||||
|
||||
public Optional<Vocabulary> find(final String vocId) {
|
||||
return Optional
|
||||
.ofNullable(vocId)
|
||||
.map(String::toLowerCase)
|
||||
.map(vocs::get);
|
||||
}
|
||||
|
||||
public void addTerm(final String vocId, final String id, final String name) {
|
||||
if (vocabularyExists(vocId)) {
|
||||
vocs.get(vocId.toLowerCase()).addTerm(id, name);
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
List<Subject> subjects = Lists
|
||||
.newArrayList(
|
||||
r
|
||||
.getSubject()
|
||||
.stream()
|
||||
|
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
s -> Optional
|
||||
.ofNullable(s.getQualifier())
|
||||
.map(q -> q.getClassid() + s.getValue())
|
||||
.orElse(s.getValue()),
|
||||
Function.identity(),
|
||||
// On a key collision keep the subject whose provenance ranks first according to
// SubjectProvenanceComparator; both colliding candidates have to take part in the comparison.
(s1, s2) -> Collections
	.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
|
||||
.values());
|
||||
r.setSubject(subjects);
|
||||
}
|
||||
if (Objects.nonNull(r.getTitle())) {
|
||||
r
|
||||
|
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
// hack to distinguish orcid from orcid_pending
|
||||
String pidProvenance = Optional
|
||||
.ofNullable(p.getDataInfo())
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
String pidProvenance = getProvenance(p.getDataInfo());
|
||||
if (p
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
|
@ -520,6 +525,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return s;
|
||||
}
|
||||
|
||||
protected static Subject cleanValue(Subject s) {
|
||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||
return s;
|
||||
}
|
||||
|
||||
protected static Field<String> cleanValue(Field<String> s) {
|
||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||
return s;
|
||||
|
|
|
@ -14,6 +14,7 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
@ -141,7 +142,7 @@ public class OafMapperUtils {
|
|||
}
|
||||
|
||||
public static Qualifier unknown(final String schemeid, final String schemename) {
|
||||
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
|
||||
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
|
||||
}
|
||||
|
||||
public static AccessRight accessRight(
|
||||
|
@ -189,6 +190,17 @@ public class OafMapperUtils {
|
|||
return q;
|
||||
}
|
||||
|
||||
public static Subject subject(
|
||||
final String value,
|
||||
final String classid,
|
||||
final String classname,
|
||||
final String schemeid,
|
||||
final String schemename,
|
||||
final DataInfo dataInfo) {
|
||||
|
||||
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(
|
||||
final String value,
|
||||
final String classid,
|
||||
|
@ -200,6 +212,20 @@ public class OafMapperUtils {
|
|||
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
||||
}
|
||||
|
||||
public static Subject subject(
|
||||
final String value,
|
||||
final Qualifier qualifier,
|
||||
final DataInfo dataInfo) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
final Subject s = new Subject();
|
||||
s.setValue(value);
|
||||
s.setQualifier(qualifier);
|
||||
s.setDataInfo(dataInfo);
|
||||
return s;
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(
|
||||
final String value,
|
||||
final Qualifier qualifier,
|
||||
|
@ -477,4 +503,15 @@ public class OafMapperUtils {
|
|||
rel.setProperties(properties);
|
||||
return rel;
|
||||
}
|
||||
|
||||
public static String getProvenance(DataInfo dataInfo) {
|
||||
return Optional
|
||||
.ofNullable(dataInfo)
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
|
||||
public class SubjectProvenanceComparator implements Comparator<Subject> {
|
||||
|
||||
@Override
|
||||
public int compare(Subject left, Subject right) {
|
||||
|
||||
String lProv = getProvenance(left.getDataInfo());
|
||||
String rProv = getProvenance(right.getDataInfo());
|
||||
|
||||
if (isBlank(lProv) && isBlank(rProv))
|
||||
return 0;
|
||||
if (isBlank(lProv))
|
||||
return 1;
|
||||
if (isBlank(rProv))
|
||||
return -1;
|
||||
if (lProv.equals(rProv))
|
||||
return 0;
|
||||
if (lProv.toLowerCase().contains("crosswalk"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("crosswalk"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("user"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("user"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("propagation"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("propagation"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("iis"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("iis"))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class Constants {
|
||||
|
@ -58,13 +59,13 @@ public class Constants {
|
|||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static StructuredProperty getSubject(String sbj, String classid, String classname,
|
||||
public static Subject getSubject(String sbj, String classid, String classname,
|
||||
String diqualifierclassid) {
|
||||
if (sbj.equals(NULL))
|
||||
return null;
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(sbj);
|
||||
sp
|
||||
Subject s = new Subject();
|
||||
s.setValue(sbj);
|
||||
s
|
||||
.setQualifier(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
|
@ -72,7 +73,7 @@ public class Constants {
|
|||
classname,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
||||
sp
|
||||
s
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
|
@ -88,7 +89,7 @@ public class Constants {
|
|||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
|
||||
return sp;
|
||||
return s;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
|
@ -79,7 +80,7 @@ public class PrepareFOSSparkJob implements Serializable {
|
|||
HashSet<String> level3 = new HashSet<>();
|
||||
addLevels(level1, level2, level3, first);
|
||||
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
List<Subject> sbjs = new ArrayList<>();
|
||||
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
|
|
|
@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
|
@ -73,7 +74,7 @@ public class PrepareSDGSparkJob implements Serializable {
|
|||
Result r = new Result();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
|
||||
SDGDataModel first = it.next();
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
List<Subject> sbjs = new ArrayList<>();
|
||||
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
|
||||
it
|
||||
.forEachRemaining(
|
||||
|
|
|
@ -19,7 +19,7 @@ import java.time.chrono.ThaiBuddhistDate
|
|||
import java.time.format.DateTimeFormatter
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
import scala.io.Source
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
|
@ -252,7 +252,7 @@ object DataciteToOAFTransformation {
|
|||
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
val l: List[Subject] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
@ -492,7 +492,7 @@ object DataciteToOAFTransformation {
|
|||
subjects
|
||||
.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
s.subject.get,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
|
|
|
@ -281,7 +281,7 @@ object BioDBToOAF {
|
|||
d.setSubject(
|
||||
subjects
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
s,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
|
|
|
@ -265,8 +265,8 @@ object PubMedToOaf {
|
|||
result.setLanguage(term)
|
||||
}
|
||||
|
||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
val subjects: List[Subject] = article.getSubjects.asScala.map(s =>
|
||||
OafMapperUtils.subject(
|
||||
s.getValue,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
|
|
|
@ -72,7 +72,7 @@ public class ProduceTest {
|
|||
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -169,7 +169,7 @@ public class ProduceTest {
|
|||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -396,7 +396,7 @@ public class ProduceTest {
|
|||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -508,7 +508,7 @@ public class ProduceTest {
|
|||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
List<Subject> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
@ -537,7 +537,7 @@ public class ProduceTest {
|
|||
|
||||
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
|
||||
|
||||
List<StructuredProperty> sbjs_sdg = tmp
|
||||
List<Subject> sbjs_sdg = tmp
|
||||
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID))
|
||||
|
|
|
@ -26,20 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
|||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class ConversionUtils {
|
||||
|
||||
|
@ -71,6 +58,10 @@ public class ConversionUtils {
|
|||
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
|
||||
}
|
||||
|
||||
public static OaBrokerTypedValue oafSubjectToBrokerTypedValue(final Subject sp) {
|
||||
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
|
||||
}
|
||||
|
||||
public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||
if (d == null) {
|
||||
return null;
|
||||
|
@ -115,7 +106,7 @@ public class ConversionUtils {
|
|||
res.setTitles(structPropList(result.getTitle()));
|
||||
res.setAbstracts(fieldList(result.getDescription()));
|
||||
res.setLanguage(classId(result.getLanguage()));
|
||||
res.setSubjects(structPropTypedList(result.getSubject()));
|
||||
res.setSubjects(subjectList(result.getSubject()));
|
||||
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
|
||||
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
|
||||
res.setPublisher(fieldValue(result.getPublisher()));
|
||||
|
@ -304,6 +295,18 @@ public class ConversionUtils {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static List<OaBrokerTypedValue> subjectList(final List<Subject> list) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
.map(ConversionUtils::oafSubjectToBrokerTypedValue)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
|
|
|
@ -391,6 +391,28 @@ object DoiBoostMappingUtil {
|
|||
di
|
||||
}
|
||||
|
||||
/** Creates a [[Subject]] whose qualifier uses `classId` for both class id and class name, and
  * `schemeId` for both scheme id and scheme name.
  *
  * @param value    the subject value
  * @param classId  used as qualifier class id and class name
  * @param schemeId used as qualifier scheme id and scheme name
  * @return the new subject
  */
def createSubject(value: String, classId: String, schemeId: String): Subject =
  createSubject(value, classId, classId, schemeId, schemeId)
|
||||
|
||||
/** Creates a [[Subject]] with the given value and a qualifier built from the four qualifier components.
  *
  * @param value      the subject value
  * @param classId    qualifier class id
  * @param className  qualifier class name
  * @param schemeId   qualifier scheme id
  * @param schemeName qualifier scheme name
  * @return the new subject
  */
def createSubject(
  value: String,
  classId: String,
  className: String,
  schemeId: String,
  schemeName: String
): Subject = {
  val subjectQualifier = OafMapperUtils.qualifier(classId, className, schemeId, schemeName)
  val subject = new Subject
  subject.setQualifier(subjectQualifier)
  subject.setValue(value)
  subject
}
|
||||
|
||||
def createSP(
|
||||
value: String,
|
||||
classId: String,
|
||||
|
|
|
@ -201,7 +201,7 @@ case object Crossref2Oaf {
|
|||
|
||||
if (subjectList.nonEmpty) {
|
||||
result.setSubject(
|
||||
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
|
||||
subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.mag
|
|||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty, Subject}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||
import org.json4s
|
||||
|
@ -210,8 +210,8 @@ case object ConversionUtil {
|
|||
val className = "Microsoft Academic Graph classification"
|
||||
val classid = "MAG"
|
||||
|
||||
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
|
||||
val s1 = createSP(
|
||||
val p: List[Subject] = fieldOfStudy.subjects.flatMap(s => {
|
||||
val s1 = createSubject(
|
||||
s.DisplayName,
|
||||
classid,
|
||||
className,
|
||||
|
@ -219,10 +219,10 @@ case object ConversionUtil {
|
|||
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||
)
|
||||
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
|
||||
var resList: List[StructuredProperty] = List(s1)
|
||||
var resList: List[Subject] = List(s1)
|
||||
if (s.MainType.isDefined) {
|
||||
val maintp = s.MainType.get
|
||||
val s2 = createSP(
|
||||
val s2 = createSubject(
|
||||
s.MainType.get,
|
||||
classid,
|
||||
className,
|
||||
|
@ -232,7 +232,7 @@ case object ConversionUtil {
|
|||
s2.setDataInfo(di)
|
||||
resList = resList ::: List(s2)
|
||||
if (maintp.contains(".")) {
|
||||
val s3 = createSP(
|
||||
val s3 = createSubject(
|
||||
maintp.split("\\.").head,
|
||||
classid,
|
||||
className,
|
||||
|
|
|
@ -290,7 +290,7 @@ public class EOSCTagJobTest {
|
|||
.stream()
|
||||
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
List<StructuredProperty> subjects = tmp
|
||||
List<Subject> subjects = tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
|
||||
.collect()
|
||||
.get(0)
|
||||
|
|
|
@ -57,6 +57,11 @@
|
|||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
|
|
|
@ -3,16 +3,15 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
|||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
|
||||
|
||||
|
@ -27,9 +26,36 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
|
||||
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
|
||||
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
|
||||
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
|
||||
return mapping;
|
||||
}
|
||||
|
||||
private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
|
||||
if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
|
||||
return;
|
||||
} else {
|
||||
// TODO cleaning based on different subject vocabs can be added here
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
||||
Subject subject) {
|
||||
AtomicReference<Boolean> modified = new AtomicReference<>(false);
|
||||
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
|
||||
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
|
||||
return;
|
||||
}
|
||||
Qualifier newValue = vocabulary.lookup(subject.getValue());
|
||||
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
|
||||
subject.setValue(newValue.getClassid());
|
||||
subject.getQualifier().setClassid(vocabularyId);
|
||||
subject.getQualifier().setClassname(vocabulary.getName());
|
||||
modified.set(true);
|
||||
}
|
||||
});
|
||||
return modified.get();
|
||||
}
|
||||
|
||||
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
|
||||
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
|
||||
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType());
|
||||
|
|
|
@ -8,20 +8,18 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.validator.routines.UrlValidator;
|
||||
import org.dom4j.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
@ -29,26 +27,7 @@ import com.google.common.collect.Sets;
|
|||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
|
@ -77,6 +56,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
protected static final Map<String, String> nsContext = new HashMap<>();
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class);
|
||||
|
||||
static {
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
|
@ -103,10 +84,10 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
this.forceOriginalId = false;
|
||||
}
|
||||
|
||||
public List<Oaf> processMdRecord(final String xml) throws DocumentException {
|
||||
public List<Oaf> processMdRecord(final String xml) {
|
||||
|
||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||
|
||||
try {
|
||||
final Document doc = DocumentHelper
|
||||
.parseText(
|
||||
xml
|
||||
|
@ -137,6 +118,10 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
final String type = getResultType(doc, instances);
|
||||
|
||||
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||
} catch (DocumentException e) {
|
||||
log.error("Error with record:\n" + xml);
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
}
|
||||
|
||||
protected String getResultType(final Document doc, final List<Instance> instances) {
|
||||
|
@ -411,7 +396,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
|
||||
protected abstract List<Subject> prepareSubjects(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Qualifier prepareLanguages(Document doc);
|
||||
|
||||
|
@ -559,6 +544,22 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return res;
|
||||
}
|
||||
|
||||
protected List<Subject> prepareSubjectList(
|
||||
final Node node,
|
||||
final String xpath,
|
||||
final DataInfo info) {
|
||||
final List<Subject> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res
|
||||
.add(
|
||||
subject(
|
||||
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
|
||||
n.valueOf("@schemename"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected OAIProvenance prepareOAIprovenance(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
|
||||
|
||||
|
@ -620,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return res;
|
||||
}
|
||||
|
||||
protected Set<String> validateUrl(Collection<String> url) {
|
||||
UrlValidator urlValidator = UrlValidator.getInstance();
|
||||
if (Objects.isNull(url)) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return url
|
||||
.stream()
|
||||
.filter(u -> urlValidator.isValid(u))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -16,6 +16,9 @@ import org.apache.hadoop.io.compress.GzipCodec;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -127,8 +130,8 @@ public class GenerateEntitiesApplication {
|
|||
.sequenceFile(sp, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(List::iterator));
|
||||
.flatMap(List::iterator)
|
||||
.filter(Objects::nonNull));
|
||||
}
|
||||
|
||||
switch (mode) {
|
||||
|
@ -155,11 +158,11 @@ public class GenerateEntitiesApplication {
|
|||
.saveAsTextFile(targetPath, GzipCodec.class);
|
||||
}
|
||||
|
||||
private static List<Oaf> convertToListOaf(
|
||||
public static List<Oaf> convertToListOaf(
|
||||
final String id,
|
||||
final String s,
|
||||
final boolean shouldHashId,
|
||||
final VocabularyGroup vocs) throws DocumentException {
|
||||
final VocabularyGroup vocs) {
|
||||
final String type = StringUtils.substringAfter(id, ":");
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
|
@ -200,8 +203,7 @@ public class GenerateEntitiesApplication {
|
|||
try {
|
||||
return OBJECT_MAPPER.readValue(s, clazz);
|
||||
} catch (final Exception e) {
|
||||
log.error("Error parsing object of class: {}", clazz);
|
||||
log.error(s);
|
||||
log.error("Error parsing object of class: {}:\n{}", clazz, s);
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -84,8 +84,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareSubjectList(doc, "//dc:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -159,9 +159,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
|
||||
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||
instance
|
||||
.setUrl(
|
||||
nodes
|
||||
final List<String> url = nodes
|
||||
.stream()
|
||||
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
||||
.map(n -> n.getText().trim())
|
||||
|
@ -174,7 +172,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
})
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)));
|
||||
.collect(Collectors.toCollection(ArrayList::new));
|
||||
final Set<String> validUrl = validateUrl(url);
|
||||
if (!validUrl.isEmpty()) {
|
||||
instance.setUrl(new ArrayList<>());
|
||||
instance.getUrl().addAll(validUrl);
|
||||
}
|
||||
|
||||
return Lists.newArrayList(instance);
|
||||
}
|
||||
|
|
|
@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
|||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.validator.routines.UrlValidator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
|
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||
}
|
||||
|
||||
Set<String> validUrl = validateUrl(url);
|
||||
|
||||
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
}
|
||||
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
if (!url.isEmpty()) {
|
||||
}
|
||||
|
||||
if (!validUrl.isEmpty()) {
|
||||
instance.setUrl(new ArrayList<>());
|
||||
instance.getUrl().addAll(url);
|
||||
instance.getUrl().addAll(validUrl);
|
||||
}
|
||||
return Arrays.asList(instance);
|
||||
}
|
||||
|
@ -257,8 +268,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//*[local-name()='subject']", info);
|
||||
protected List<Subject> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareSubjectList(doc, "//*[local-name()='subject']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
 * Spark job that scans the (Text, Text) sequence files found under a comma separated list of source paths, runs each
 * record through {@link GenerateEntitiesApplication#convertToListOaf} and stores, as gzip compressed text under
 * {@code invalidPath}, the raw content of the records whose mapping produced an empty list.
 */
public class VerifyRecordsApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(VerifyRecordsApplication.class);
|
||||
|
||||
/**
 * Entry point: parses the arguments described by the {@code verify_records_parameters.json} classpath resource,
 * loads the vocabularies from the ISLookUp service, removes any previous content of {@code invalidPath} and
 * runs the verification inside a Spark session.
 *
 * @param args command line arguments; the code reads isSparkSessionManaged, sourcePaths, invalidPath and isLookupUrl
 * @throws Exception propagated from argument parsing, vocabulary loading or the Spark job
 */
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
VerifyRecordsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
// when the argument is not provided the session is considered managed (TRUE)
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String sourcePaths = parser.get("sourcePaths");
|
||||
log.info("sourcePaths: {}", sourcePaths);
|
||||
|
||||
// NOTE(review): invalidPath is used unconditionally below (remove + write) - confirm that the
// parameter definition marks it as required
final String invalidPath = parser.get("invalidPath");
|
||||
log.info("invalidPath: {}", invalidPath);
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
// start from a clean output location: validateRecords appends to it once per source path
HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration());
|
||||
validateRecords(spark, sourcePaths, invalidPath, vocs);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Processes, one at a time, the source paths that exist according to {@code HdfsSupport.exists} and appends
 * the records that could not be mapped to {@code invalidPath}.
 *
 * @param spark the active Spark session
 * @param sourcePaths comma separated list of sequence file paths
 * @param invalidPath output location of the invalid records (gzip compressed text)
 * @param vocs vocabularies handed over to the mapping
 */
private static void validateRecords(SparkSession spark, String sourcePaths, String invalidPath,
|
||||
VocabularyGroup vocs) {
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
// paths that do not exist are skipped
final List<String> existingSourcePaths = Arrays
|
||||
.stream(sourcePaths.split(","))
|
||||
.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
log.info("Verify records in files:");
|
||||
existingSourcePaths.forEach(log::info);
|
||||
|
||||
for (final String sp : existingSourcePaths) {
|
||||
// tryApplyMapping returns null for the records considered valid, which are dropped by the filter
RDD<String> invalidRecords = sc
|
||||
.sequenceFile(sp, Text.class, Text.class)
|
||||
.map(k -> tryApplyMapping(k._1().toString(), k._2().toString(), true, vocs))
|
||||
.filter(Objects::nonNull)
|
||||
.rdd();
|
||||
// Append mode: the output of every source path accumulates under the same invalidPath
spark
|
||||
.createDataset(invalidRecords, Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.text(invalidPath);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Applies the mapping to a single record and tells whether it has to be reported as invalid.
 *
 * @param id the sequence file key, passed as is to the mapping
 * @param xmlRecord the sequence file value, i.e. the serialized record
 * @param shouldHashId forwarded to the mapping
 * @param vocs vocabularies forwarded to the mapping
 * @return the given record when the mapping produced an empty list, {@code null} otherwise
 */
private static String tryApplyMapping(
|
||||
final String id,
|
||||
final String xmlRecord,
|
||||
final boolean shouldHashId,
|
||||
final VocabularyGroup vocs) {
|
||||
|
||||
final List<Oaf> oaf = GenerateEntitiesApplication.convertToListOaf(id, xmlRecord, shouldHashId, vocs);
|
||||
// NOTE(review): a null list falls into the orElse(false) branch, i.e. it is NOT reported as invalid -
// confirm this is intended
if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) {
|
||||
return xmlRecord;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
|
|||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -24,8 +25,11 @@ import org.apache.http.impl.client.HttpClients;
|
|||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.OafToOafMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.OdfToOafMapper;
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class AbstractMigrationApplication implements Closeable {
|
||||
|
|
|
@ -446,10 +446,34 @@
|
|||
<join name="wait_import" to="fork_generate_entities"/>
|
||||
|
||||
<fork name="fork_generate_entities">
|
||||
<path start="GenerateEntities_claim"/>
|
||||
<path start="GenerateEntities"/>
|
||||
<path start="VerifyRecords_claim"/>
|
||||
<path start="VerifyRecords"/>
|
||||
</fork>
|
||||
|
||||
<action name="VerifyRecords_claim">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>VerifyRecords_claim</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
|
||||
<arg>--invalidPath</arg><arg>${workingDir}/invalid_records_claim</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="GenerateEntities_claim"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities_claim">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -499,6 +523,30 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="VerifyRecords">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>VerifyRecords</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible</arg>
|
||||
<arg>--invalidPath</arg><arg>${workingDir}/invalid_records</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePaths",
|
||||
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "invalidPath",
|
||||
"paramDescription": "the path of the invalid records file",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "isu",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the url of the ISLookupService",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -251,6 +251,33 @@ public class GraphCleaningFunctionsTest {
|
|||
pid.getQualifier().getClassname()));
|
||||
});
|
||||
|
||||
assertNotNull(p_cleaned.getSubject());
|
||||
|
||||
List<Subject> fos_subjects = p_cleaned
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertNotNull(fos_subjects);
|
||||
assertEquals(2, fos_subjects.size());
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0101 mathematics".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||
"sysimport:crosswalk:datasetarchive"
|
||||
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||
|
||||
// TODO add more assertions to verify the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_cleaned));
|
||||
}
|
||||
|
|
|
@ -21,7 +21,6 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
|||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
|
@ -928,7 +927,8 @@ class MappersTest {
|
|||
|
||||
@Test
|
||||
void testROHub2() throws IOException, DocumentException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
@ -947,6 +947,15 @@ class MappersTest {
|
|||
|
||||
}
|
||||
|
||||
/**
 * Checks that mapping the {@code oaf_notwellformed.xml} test resource with the {@link OafToOafMapper} yields a
 * non-null, empty list.
 */
@Test
|
||||
void testNotWellFormed() throws IOException {
|
||||
// requireNonNull makes the test fail right away when the resource is missing from the classpath
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
||||
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
assertNotNull(actual);
|
||||
assertTrue(actual.isEmpty());
|
||||
}
|
||||
|
||||
private void assertValidId(final String id) {
|
||||
// System.out.println(id);
|
||||
|
||||
|
|
|
@ -251,6 +251,18 @@ class MigrateDbEntitiesApplicationTest {
|
|||
assertValidId(r2.getSource());
|
||||
assertEquals(r1.getSource(), r2.getTarget());
|
||||
assertEquals(r2.getSource(), r1.getTarget());
|
||||
|
||||
assertTrue(r1.getSource().startsWith("10|"));
|
||||
assertTrue(r1.getTarget().startsWith("20|"));
|
||||
|
||||
assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r1.getRelType());
|
||||
assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r2.getRelType());
|
||||
|
||||
assertEquals(ModelConstants.PROVISION, r1.getSubRelType());
|
||||
assertEquals(ModelConstants.PROVISION, r2.getSubRelType());
|
||||
|
||||
assertEquals(ModelConstants.IS_PROVIDED_BY, r1.getRelClass());
|
||||
assertEquals(ModelConstants.PROVIDES, r2.getRelClass());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -743,12 +743,12 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "infrared detectors"
|
||||
"value": "FOS: Mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
@ -765,12 +765,12 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "lens antennas"
|
||||
"value": "FOS: Computer and information sciences"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
@ -787,12 +787,34 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "silicon"
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "subject:fos",
|
||||
"classname": "subject:fos",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
|
|
@ -1244,3 +1244,5 @@ dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
|
|||
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
|
||||
dnet:relation_subRelType @=@ relationship @=@ publicationDataset
|
||||
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
|
||||
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
|
||||
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences
|
|
@ -1119,3 +1119,6 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
|
|||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences
|
|
@ -0,0 +1,70 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header xmlns="http://namespace.openaire.eu/">
|
||||
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
|
||||
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
|
||||
<dri:dateOfCollection/>
|
||||
<dri:mdFormat/>
|
||||
<dri:mdFormatInterpretation/>
|
||||
<dri:repositoryId/>
|
||||
<dr:objectIdentifier/>
|
||||
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
|
||||
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
|
||||
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
|
||||
</header>
|
||||
<metadata xmlns="http://namespace.openaire.eu/">
|
||||
<dc:title>多項式GCDを用いた復号法に関する研究<dc:title>
|
||||
<dc:creator>上原, 剛</dc:creator>
|
||||
<dc:creator>甲斐, 博</dc:creator>
|
||||
<dc:creator>野田, 松太郎</dc:creator>
|
||||
<dc:format>application/pdf</dc:format>
|
||||
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
|
||||
<dc:language>jpn</dc:language>
|
||||
<dc:publisher>京都大学数理解析研究所</dc:publisher>
|
||||
<dc:subject classid="ndc" classname="ndc"
|
||||
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
|
||||
<dc:type>Departmental Bulletin Paper</dc:type>
|
||||
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
|
||||
<oaf:projectid/>
|
||||
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
|
||||
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
|
||||
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
|
||||
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
|
||||
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
|
||||
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
|
||||
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
|
||||
</metadata>
|
||||
<about>
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
|
||||
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
|
||||
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
|
||||
<datestamp>2021-04-13T13:36:29Z</datestamp>
|
||||
<metadataNamespace/>
|
||||
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
|
||||
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
|
||||
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
|
||||
<datestamp>2012-07-12T14:15:41Z</datestamp>
|
||||
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
|
||||
</originDescription>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||
classname="sysimport:crosswalk:repository"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
|
@ -0,0 +1,8 @@
|
|||
# Root logger option
|
||||
log4j.rootLogger=DEBUG, stdout
|
||||
|
||||
# Direct log messages to stdout
|
||||
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
|
||||
log4j.appender.stdout.Target=System.out
|
||||
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
|
|
@ -59,7 +59,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
r.setId(id.toLowerCase.trim)
|
||||
r.setSubject(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
FAKE_SUBJECT,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
|
@ -250,7 +250,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
val r = new Result
|
||||
r.setSubject(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
OafMapperUtils.subject(
|
||||
FAKE_SUBJECT,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
|
|
|
@ -95,7 +95,6 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
|
||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
|
||||
|
@ -129,8 +128,6 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
void testDoiUrlNormalization() throws MalformedURLException {
|
||||
|
||||
|
|
|
@ -76,11 +76,11 @@ compute stats indi_result_with_orcid;
|
|||
|
||||
---- Sprint 3 ----
|
||||
create table indi_funded_result_with_fundref stored as parquet as
|
||||
select distinct r.id, coalesce(fundref, 0) as fundref
|
||||
select distinct r.result as id, coalesce(fundref, 0) as fundref
|
||||
from project_results r
|
||||
left outer join (select distinct id, 1 as fundref from project_results
|
||||
left outer join (select distinct result, 1 as fundref from project_results
|
||||
where provenance='Harvested') tmp
|
||||
on r.id= tmp.id;
|
||||
on r.result= tmp.result;
|
||||
|
||||
compute stats indi_funded_result_with_fundref;
|
||||
|
||||
|
@ -179,17 +179,17 @@ from publication_datasources pd
|
|||
|
||||
compute stats indi_pub_diamond;
|
||||
|
||||
create table indi_pub_hybrid stored as parquet as
|
||||
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
|
||||
from publication_datasources pd
|
||||
left outer join (
|
||||
select pd.id, 1 as is_hybrid from publication_datasources pd
|
||||
join datasource d on d.id=pd.datasource
|
||||
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||
and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
|
||||
on pd.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_hybrid;
|
||||
--create table indi_pub_hybrid stored as parquet as
|
||||
--select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
|
||||
--from publication_datasources pd
|
||||
-- left outer join (
|
||||
-- select pd.id, 1 as is_hybrid from publication_datasources pd
|
||||
-- join datasource d on d.id=pd.datasource
|
||||
-- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
|
||||
-- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
|
||||
-- on pd.id=tmp.id;
|
||||
--
|
||||
--compute stats indi_pub_hybrid;
|
||||
|
||||
create table indi_pub_in_transformative stored as parquet as
|
||||
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
|
||||
|
@ -564,12 +564,12 @@ create table indi_org_fairness stored as parquet as
|
|||
(select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
|
||||
join result r on r.id=ro.id
|
||||
--join result_pids rp on r.id=rp.id
|
||||
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003
|
||||
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003
|
||||
group by ro.organization),
|
||||
--return all results group by organization
|
||||
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
|
||||
join result r on r.id=ro.id
|
||||
where year>2003
|
||||
where cast(year as int)>2003
|
||||
group by organization)
|
||||
--return results_fair/all_results
|
||||
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
|
@ -638,11 +638,11 @@ create table indi_org_fairness_year stored as parquet as
|
|||
(select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
|
||||
join result r on r.id=ro.id
|
||||
join result_pids rp on r.id=rp.id
|
||||
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003
|
||||
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003
|
||||
group by ro.organization, year),
|
||||
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro
|
||||
join result r on r.id=ro.id
|
||||
where year>2003
|
||||
where cast(year as int)>2003
|
||||
group by organization, year)
|
||||
--return results_fair/all_results
|
||||
select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
|
@ -657,12 +657,12 @@ create table indi_org_findable_year stored as parquet as
|
|||
(select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
|
||||
join result_pids rp on rp.id=ro.id
|
||||
join result r on r.id=rp.id
|
||||
where year >2003
|
||||
where cast(year as int) >2003
|
||||
group by ro.organization, year),
|
||||
--return all results group by organization,year
|
||||
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro
|
||||
join result r on r.id=ro.id
|
||||
where year >2003
|
||||
where cast(year as int) >2003
|
||||
group by organization, year)
|
||||
--return results_with_pid/all_results
|
||||
select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
|
@ -677,12 +677,12 @@ create table indi_org_findable stored as parquet as
|
|||
(select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
|
||||
join result_pids rp on rp.id=ro.id
|
||||
join result r on r.id=rp.id
|
||||
where year >2003
|
||||
where cast(year as int) >2003
|
||||
group by ro.organization),
|
||||
--return all results group by organization
|
||||
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
|
||||
join result r on r.id=ro.id
|
||||
where year >2003
|
||||
where cast(year as int) >2003
|
||||
group by organization)
|
||||
--return results_with_pid/all_results
|
||||
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
|
|
|
@ -3,20 +3,20 @@
|
|||
----------------------------------------------------
|
||||
|
||||
-- Peer reviewed:
|
||||
create table ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
|
||||
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
|
||||
|
||||
-- Green OA:
|
||||
create table ${stats_db_name}.result_greenoa STORED AS PARQUET as
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
|
||||
select r.id, case when green.green_oa=1 then true else false end as green
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
|
||||
|
||||
-- GOLD OA:
|
||||
create table ${stats_db_name}.result_gold STORED AS PARQUET as
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
|
||||
select r.id, case when gold.is_gold=1 then true else false end as gold
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
|
|
@ -45,7 +45,10 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
|
||||
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
|
||||
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
|
||||
'openorgs____::4ac562f0376fce3539504567649cb373' -- University of Patras
|
||||
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
|
||||
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3' -- École des Ponts ParisTech
|
||||
) )) foo;
|
||||
compute stats TARGET.result;
|
||||
|
||||
|
@ -159,10 +162,10 @@ create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * fro
|
|||
compute stats TARGET.indi_pub_doi_from_crossref;
|
||||
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_gold_oa;
|
||||
create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_datasets_gold_oa;
|
||||
create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_software_gold_oa;
|
||||
--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--compute stats TARGET.indi_datasets_gold_oa;
|
||||
--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--compute stats TARGET.indi_software_gold_oa;
|
||||
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_has_abstract;
|
||||
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
|
8
pom.xml
8
pom.xml
|
@ -200,6 +200,12 @@
|
|||
<version>${dhp.commons.lang.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
<version>1.7</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.github.sisyphsu</groupId>
|
||||
<artifactId>dateparser</artifactId>
|
||||
|
@ -801,7 +807,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[2.12.2-SNAPSHOT]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[3.14.0]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue