commit c97facf5e6 — conflict resolution in the comparator test class
@@ -28,3 +28,4 @@ spark-warehouse
 /**/.scalafmt.conf
 /.java-version
 /dhp-shade-package/dependency-reduced-pom.xml
+/**/job.properties

@@ -1,5 +1,5 @@
 
-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;
 
 import java.util.Arrays;
 import java.util.Iterator;
@@ -61,7 +61,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
    private Relation getRelation(String orcid1, String orcid2) {
        String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
        String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
-       return OafMapperUtils
+       Relation relation = OafMapperUtils
            .getRelation(
                source, target, ModelConstants.PERSON_PERSON_RELTYPE,
                ModelConstants.PERSON_PERSON_SUBRELTYPE,
@@ -76,5 +76,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
                    ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
                "0.91"),
            null);
+       relation.setValidated(true);
+       return relation;
    }
 }
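The hunk above derives the two person identifiers from ORCID iDs, builds the co-authorship relation, and now flags it as validated before returning it. Below is a minimal, self-contained sketch of the id-derivation idea only; the PERSON_PREFIX value and the md5 helper are placeholders standing in for the project's constants and IdentifierFactory, not the actual API.

    import java.math.BigInteger;
    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;

    public class CoAuthorshipSketch {

        // Hypothetical prefix; stands in for the PERSON_PREFIX built from the DHP model constants.
        private static final String PERSON_PREFIX = "30|orcid_______::";

        // Plain MD5 hex digest, analogous in spirit to IdentifierFactory.md5 (assumed behaviour).
        static String md5(String s) throws Exception {
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] digest = md.digest(s.getBytes(StandardCharsets.UTF_8));
            return String.format("%032x", new BigInteger(1, digest));
        }

        public static void main(String[] args) throws Exception {
            String source = PERSON_PREFIX + md5("0000-0002-1825-0097");
            String target = PERSON_PREFIX + md5("0000-0001-5109-3700");
            // The commit builds the relation between these two ids, marks it validated, then returns it.
            System.out.println(source + " --coAuthorship--> " + target + " (validated=true)");
        }
    }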
@@ -1,12 +1,9 @@
 
-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;
 
 import java.io.Serializable;
-import java.util.ArrayList;
 import java.util.List;
 
-import eu.dnetlib.dhp.schema.oaf.Relation;
-
 public class Coauthors implements Serializable {
    private List<String> coauthors;
 
@@ -2,8 +2,7 @@
 package eu.dnetlib.dhp.oa.merge;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.when;
+import static org.apache.spark.sql.functions.*;
 
 import java.util.Map;
 import java.util.Optional;
@@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob {
                    .applyCoarVocabularies(entity, vocs),
                OAFENTITY_KRYO_ENC)
            .groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
-           .mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
+           .mapGroups(
+               (MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
+               OAFENTITY_KRYO_ENC)
            .map(
                (MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
                    t.getClass().getName(), t),
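The mapGroups change swaps a method reference for a lambda so the captured vocabulary group can be forwarded into the merge call. A small, self-contained Spark Java sketch of the same groupByKey/mapGroups reduce pattern on toy string records (not the project's OafEntity types):

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.api.java.function.MapGroupsFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;

    import java.util.Arrays;

    public class GroupAndMergeSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("group-and-merge").getOrCreate();

            // Toy records shaped "id|payload"; the real job groups OafEntity instances by id.
            Dataset<String> records = spark
                .createDataset(Arrays.asList("a|x", "a|y", "b|z"), Encoders.STRING());

            Dataset<String> merged = records
                .groupByKey((MapFunction<String, String>) r -> r.split("\\|")[0], Encoders.STRING())
                // a lambda (rather than a method reference) lets extra state, such as a vocabulary
                // group, be captured and passed to the per-group merge function
                .mapGroups((MapGroupsFunction<String, String, String>) (key, group) -> {
                    StringBuilder sb = new StringBuilder(key).append(":");
                    group.forEachRemaining(r -> sb.append(r.split("\\|")[1]));
                    return sb.toString();
                }, Encoders.STRING());

            merged.show(false);
            spark.stop();
        }
    }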
@@ -2,7 +2,6 @@
 package eu.dnetlib.dhp.schema.oaf.utils;
 
 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
 
 import java.net.MalformedURLException;
@@ -363,6 +362,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
            // nothing to clean here
        } else if (value instanceof Project) {
            // nothing to clean here
+       } else if (value instanceof Person) {
+           // nothing to clean here
        } else if (value instanceof Organization) {
            Organization o = (Organization) value;
            if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
@@ -694,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
                }
            }
 
+           // set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
            for (Author a : r.getAuthor()) {
                if (Objects.isNull(a.getPid())) {
                    a.setPid(Lists.newArrayList());
@@ -750,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions {
                            .collect(Collectors.toList()));
                }
            }
 
+           // Identify clashing ORCIDS:that is same ORCID associated to multiple authors in this result
+           Map<String, Integer> clashing_orcid = new HashMap<>();
+
+           for (Author a : r.getAuthor()) {
+               a
+                   .getPid()
+                   .stream()
+                   .filter(
+                       p -> StringUtils
+                           .contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
+                   .map(StructuredProperty::getValue)
+                   .distinct()
+                   .forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
+           }
+
+           Set<String> clashing = clashing_orcid
+               .entrySet()
+               .stream()
+               .filter(ee -> ee.getValue() > 1)
+               .map(Map.Entry::getKey)
+               .collect(Collectors.toSet());
+
+           // filter out clashing orcids
+           for (Author a : r.getAuthor()) {
+               a
+                   .setPid(
+                       a
+                           .getPid()
+                           .stream()
+                           .filter(p -> !clashing.contains(p.getValue()))
+                           .collect(Collectors.toList()));
+           }
+
        }
        if (value instanceof Publication) {
@@ -808,7 +844,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
        return author;
    }
 
-   private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
+   public static Optional<String> cleanDateField(Field<String> dateofacceptance) {
        return Optional
            .ofNullable(dateofacceptance)
            .map(Field::getValue)
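The new cleaning step counts how often each pending ORCID appears across the authors of one result and drops every value that occurs more than once. A simplified, standalone rendering of that idea using only JDK collections (the real code walks Author/StructuredProperty objects):

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.stream.Collectors;

    public class ClashingOrcidSketch {
        public static void main(String[] args) {
            // ORCIDs attached to the authors of one result; the same value on two authors is a clash.
            List<String> authorOrcids = Arrays.asList("0000-0001-0000-0001", "0000-0001-0000-0001", "0000-0002-0000-0002");

            Map<String, Integer> counts = new HashMap<>();
            authorOrcids.forEach(orcid -> counts.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));

            Set<String> clashing = counts
                .entrySet()
                .stream()
                .filter(e -> e.getValue() > 1)
                .map(Map.Entry::getKey)
                .collect(Collectors.toSet());

            // Values appearing more than once are removed from every author, mirroring the cleaning step above.
            List<String> kept = authorOrcids.stream().filter(o -> !clashing.contains(o)).collect(Collectors.toList());
            System.out.println("clashing=" + clashing + " kept=" + kept);
        }
    }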
@@ -204,6 +204,7 @@ public class IdentifierFactory implements Serializable {
            .map(
                pp -> pp
                    .stream()
+                   .filter(p -> StringUtils.isNotBlank(p.getValue()))
                    // filter away PIDs provided by a DS that is not considered an authority for the
                    // given PID Type
                    .filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
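The added predicate discards PIDs whose value is blank before the authority check runs. A tiny sketch of the same stream-filtering idea on plain strings (not the project's StructuredProperty type):

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    public class BlankPidFilterSketch {
        public static void main(String[] args) {
            // A blank PID value would otherwise reach the authority check; dropping it first keeps
            // downstream predicates from having to handle empty strings.
            List<String> pidValues = Arrays.asList("10.1000/xyz123", "", "   ");
            List<String> usable = pidValues
                .stream()
                .filter(v -> v != null && !v.trim().isEmpty()) // same intent as StringUtils.isNotBlank
                .collect(Collectors.toList());
            System.out.println(usable);
        }
    }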
@@ -23,24 +23,30 @@ import org.apache.commons.lang3.tuple.Pair;
 import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.base.Joiner;
 
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.common.AccessRightComparator;
+import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 
 public class MergeUtils {
 
-   public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
-       return mergeGroup(s, oafEntityIterator, true);
+   public static <T extends Oaf> T mergeById(Iterator<T> oafEntityIterator, VocabularyGroup vocs) {
+       return mergeGroup(oafEntityIterator, true, vocs);
    }
 
-   public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
-       return mergeGroup(s, oafEntityIterator, false);
+   public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator) {
+       return mergeGroup(oafEntityIterator, false);
    }
 
-   public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
-       boolean checkDelegateAuthority) {
+   public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator, boolean checkDelegateAuthority) {
+       return mergeGroup(oafEntityIterator, checkDelegateAuthority, null);
+   }
+
+   public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
+       boolean checkDelegateAuthority, VocabularyGroup vocs) {
 
        ArrayList<T> sortedEntities = new ArrayList<>();
        oafEntityIterator.forEachRemaining(sortedEntities::add);
@@ -49,13 +55,55 @@ public class MergeUtils {
        Iterator<T> it = sortedEntities.iterator();
        T merged = it.next();
 
-       while (it.hasNext()) {
-           merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
+       if (!it.hasNext() && merged instanceof Result && vocs != null) {
+           return enforceResultType(vocs, (Result) merged);
+       } else {
+           while (it.hasNext()) {
+               merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
+           }
        }
 
        return merged;
    }
 
+   private static <T extends Oaf> T enforceResultType(VocabularyGroup vocs, Result mergedResult) {
+       if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) {
+           return (T) mergedResult;
+       } else {
+           final Instance i = mergedResult.getInstance().get(0);
+
+           if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
+               return (T) mergedResult;
+           } else {
+               final String expectedResultType = Optional
+                   .ofNullable(
+                       vocs
+                           .lookupTermBySynonym(
+                               ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()))
+                   .orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE)
+                   .getClassid();
+
+               // there is a clash among the result types
+               if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) {
+
+                   Result result = (Result) Optional
+                       .ofNullable(ModelSupport.oafTypes.get(expectedResultType))
+                       .map(r -> {
+                           try {
+                               return r.newInstance();
+                           } catch (InstantiationException | IllegalAccessException e) {
+                               throw new IllegalStateException(e);
+                           }
+                       })
+                       .orElse(new OtherResearchProduct());
+                   result.setId(mergedResult.getId());
+                   return (T) mergeResultFields(result, mergedResult);
+               } else {
+                   return (T) mergedResult;
+               }
+           }
+       }
+   }
+
    public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
        return (T) merge(left, right, checkDelegateAuthority);
    }
@@ -106,7 +154,7 @@ public class MergeUtils {
            return mergeSoftware((Software) left, (Software) right);
        }
 
-       return mergeResultFields((Result) left, (Result) right);
+       return left;
    } else if (sameClass(left, right, Datasource.class)) {
        // TODO
        final int trust = compareTrust(left, right);
@@ -654,16 +702,9 @@ public class MergeUtils {
    }
 
    private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
-       if (d1 == null || StringUtils.isBlank(d1.getValue())) {
+       if (!GraphCleaningFunctions.cleanDateField(d1).isPresent()) {
            return d2;
-       } else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
-           return d1;
-       }
-
-       if (StringUtils.contains(d1.getValue(), "null")) {
-           return d2;
-       }
-       if (StringUtils.contains(d2.getValue(), "null")) {
+       } else if (!GraphCleaningFunctions.cleanDateField(d2).isPresent()) {
            return d1;
        }
 
@@ -715,7 +756,11 @@ public class MergeUtils {
    private static String spKeyExtractor(StructuredProperty sp) {
        return Optional
            .ofNullable(sp)
-           .map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
+           .map(
+               s -> Joiner
+                   .on("||")
+                   .useForNull("")
+                   .join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
            .orElse(null);
    }
 
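The new enforceResultType step looks up the result type implied by the first instance in a vocabulary and, if it disagrees with the merged record's declared type, rebuilds the record under the expected type. A simplified, standalone sketch of that decision using a plain map as a stand-in for the VocabularyGroup lookup (none of the DHP types are used here):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Optional;

    public class EnforceResultTypeSketch {

        // Toy stand-in for the vocabulary lookup: instance type -> expected result type.
        private static final Map<String, String> TYPOLOGY = new HashMap<>();
        static {
            TYPOLOGY.put("Article", "publication");
            TYPOLOGY.put("Dataset", "dataset");
        }

        public static void main(String[] args) {
            String mergedResultType = "publication"; // what the merged record currently claims to be
            String firstInstanceType = "Dataset";    // what its first instance says it is

            String expected = Optional
                .ofNullable(TYPOLOGY.get(firstInstanceType))
                .orElse("other"); // fall back to "other research product" when the vocabulary has no entry

            if (!expected.equals(mergedResultType)) {
                // the commit rebuilds the merged record as an instance of the expected type and
                // copies the result fields over; here we only report the decision
                System.out.println("result type clash: keeping " + expected + " instead of " + mergedResultType);
            } else {
                System.out.println("result type consistent: " + mergedResultType);
            }
        }
    }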
@@ -1,6 +1,12 @@
 
 package eu.dnetlib.dhp.schema.oaf.utils;
 
+import java.util.Map;
+
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
 public class ModelHardLimits {
 
    private ModelHardLimits() {
@@ -12,6 +18,7 @@ public class ModelHardLimits {
 
    public static final int MAX_EXTERNAL_ENTITIES = 50;
    public static final int MAX_AUTHORS = 200;
+   public static final int MAX_RELATED_AUTHORS = 20;
    public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
    public static final int MAX_TITLE_LENGTH = 5000;
    public static final int MAX_TITLES = 10;
@@ -19,6 +26,12 @@ public class ModelHardLimits {
    public static final int MAX_ABSTRACT_LENGTH = 150000;
    public static final int MAX_RELATED_ABSTRACT_LENGTH = 500;
    public static final int MAX_INSTANCES = 10;
+   public static final Map<String, Long> MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap();
+
+   static {
+       MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500L);
+       MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500L);
+   }
 
    public static String getCollectionName(String format) {
        return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
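The new MAX_RELATIONS_BY_RELCLASS map declares, per relation class, how many relations a consumer may keep. A minimal sketch of how such a limits map could be consulted to truncate an over-long list; the relation-class key and the tiny cap are illustrative placeholders, not the project's constants:

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    public class RelationCapSketch {

        // Mirrors the shape of MAX_RELATIONS_BY_RELCLASS: relation class -> maximum number to keep.
        private static final Map<String, Long> MAX_BY_RELCLASS = new HashMap<>();
        static {
            MAX_BY_RELCLASS.put("hasCoAuthored", 2L); // tiny cap just for the demo; the commit uses 500
        }

        public static void main(String[] args) {
            List<String> coAuthorRelations = Arrays.asList("r1", "r2", "r3", "r4");
            long cap = MAX_BY_RELCLASS.getOrDefault("hasCoAuthored", Long.MAX_VALUE);

            // A consumer of the limits map can truncate an over-long relation list like this.
            List<String> kept = coAuthorRelations.stream().limit(cap).collect(Collectors.toList());
            System.out.println(kept);
        }
    }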
@@ -26,7 +26,7 @@ public class PidCleaner {
        String value = Optional
            .ofNullable(pidValue)
            .map(String::trim)
-           .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
+           .orElseThrow(() -> new IllegalArgumentException("PID (" + pidType + ") value cannot be empty"));
 
        switch (pidType) {
 
@@ -179,7 +179,7 @@ class OafMapperUtilsTest {
        assertEquals(
            ModelConstants.DATASET_RESULTTYPE_CLASSID,
            ((Result) MergeUtils
-               .merge(p2, d1))
+               .merge(p2, d1, true))
                .getResulttype()
                .getClassid());
    }
@@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction
 
    @Override
    protected Collection<String> doApply(Config conf, String s) {
-       return suffixPrefixChain(cleanup(s), param("mod"));
+       return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
    }
 
    private Collection<String> suffixPrefixChain(String s, int mod) {
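Switching from param("mod") to paramOrDefault("mod", 10) means a missing configuration key no longer fails but falls back to a default. A tiny sketch of that pattern with a plain map standing in for the clustering configuration:

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Optional;

    public class ParamOrDefaultSketch {

        // Same idea as paramOrDefault("mod", 10): absent keys resolve to a caller-supplied default.
        static int paramOrDefault(Map<String, Integer> params, String key, int dflt) {
            return Optional.ofNullable(params.get(key)).orElse(dflt);
        }

        public static void main(String[] args) {
            Map<String, Integer> params = new HashMap<>(); // "mod" deliberately not set
            System.out.println(paramOrDefault(params, "mod", 10)); // prints 10
        }
    }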
@@ -90,7 +90,7 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
        inferFrom = normalize(inferFrom);
        inferFrom = filterAllStopWords(inferFrom);
        Set<String> cities = getCities(inferFrom, 4);
-       return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
+       return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
    }
 
    public static String cityInference(String original) {
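The added filter(Objects::nonNull) matters because Stream.findFirst() throws a NullPointerException when the selected element is null; dropping nulls first makes the "UNKNOWN" fallback reachable. A minimal illustration of that behaviour on a plain list:

    import java.util.Arrays;
    import java.util.List;
    import java.util.Objects;

    public class NonNullFirstSketch {
        public static void main(String[] args) {
            // Without the nonNull filter, findFirst() on a leading null element throws an NPE.
            List<String> countries = Arrays.asList(null, "IT", "UK");
            String first = countries.stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
            System.out.println(first); // IT
        }
    }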
@@ -54,6 +54,22 @@ public class FieldDef implements Serializable {
    public FieldDef() {
    }
 
+   public FieldDef clone() {
+       FieldDef fieldDef = new FieldDef();
+       fieldDef.setName(this.name);
+       fieldDef.setPath(this.path);
+       fieldDef.setType(this.type);
+       fieldDef.setOverrideMatch(this.overrideMatch);
+       fieldDef.setSize(this.size);
+       fieldDef.setLength(this.length);
+       fieldDef.setFilter(this.filter);
+       fieldDef.setSorted(this.sorted);
+       fieldDef.setClean(this.clean);
+       fieldDef.setInfer(this.infer);
+       fieldDef.setInferenceFrom(this.inferenceFrom);
+       return fieldDef;
+   }
+
    public String getInferenceFrom() {
        return inferenceFrom;
    }
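The clone() method exists so the dedup model can derive a parallel "<name>_filtered" field definition without mutating the original one. A stripped-down sketch of that copy-then-rename pattern with a hypothetical two-field class standing in for FieldDef:

    public class FieldCopySketch {

        // Minimal stand-in for FieldDef: only enough state to show why a field-by-field copy is used.
        static class Field {
            String name;
            String path;

            Field copy() {
                Field f = new Field();
                f.name = this.name;
                f.path = this.path;
                return f;
            }
        }

        public static void main(String[] args) {
            Field original = new Field();
            original.name = "title";
            original.path = "$.title";

            // The deduper derives a "<name>_filtered" column; copying first keeps the original untouched.
            Field filtered = original.copy();
            filtered.name = original.name + "_filtered";

            System.out.println(original.name + " / " + filtered.name); // title / title_filtered
        }
    }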
@@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
   val model: SparkModel = SparkModel(conf)
 
   val dedup: (Dataset[Row] => Dataset[Row]) = df => {
-    df.transform(filterAndCleanup)
-      .transform(generateClustersWithCollect)
+    df.transform(generateClustersWithCollect)
       .transform(processBlocks)
   }
 
-
-  val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
-    val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
-      if (conf.blacklists.containsKey(fdef.getName)) {
-        res.withColumn(
-          fdef.getName + "_filtered",
-          filterColumnUDF(fdef).apply(new Column(fdef.getName))
-        )
-      } else {
-        res
-      }
-    })
-
-    df_with_filters
-  }
-
-  def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
-    val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
-
-    if (blacklist == null) {
-      throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
-    } else {
-      fdef.getType match {
-        case Type.List | Type.JSON =>
-          udf[Array[String], Array[String]](values => {
-            values.filter((v: String) => !blacklist.test(v))
-          })
-
-        case _ =>
-          udf[String, String](v => {
-            if (blacklist.test(v)) ""
-            else v
-          })
-      }
-    }
-  }
-
   val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
     var df_with_clustering_keys: Dataset[Row] = null
 
@@ -5,12 +5,12 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
 import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
 import org.apache.spark.sql.{Dataset, Row}
 
 import java.util.Locale
+import java.util.function.Predicate
 import java.util.regex.Pattern
 import scala.collection.JavaConverters._
 
@@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
     identifier.setName(identifierFieldName)
     identifier.setType(Type.String)
 
+    // create fields for blacklist
+    val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
+      if (conf.blacklists().containsKey(fdef.getName)) {
+        val fdef_filtered = fdef.clone()
+        fdef_filtered.setName(fdef.getName + "_filtered")
+        Seq(fdef, fdef_filtered)
+      }
+      else {
+        Seq(fdef)
+      }
+    })
+
     // Construct a Spark StructType representing the schema of the model
-    (Seq(identifier) ++ conf.getPace.getModel.asScala)
+    (Seq(identifier) ++ filtered)
       .foldLeft(
         new StructType()
       )((resType, fieldDef) => {
@@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
       })
     })
 
-
   }
 
   val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
@@ -52,7 +63,8 @@ case class SparkModel(conf: DedupConfig) {
   val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
 
   val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
+    df
+      .map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
   }
 
   def rowFromJson(json: String): Row = {
@@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {
 
     schema.fieldNames.zipWithIndex.foldLeft(values) {
       case ((res, (fname, index))) =>
-        val fdef = conf.getPace.getModelMap.get(fname)
+        val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))
+
         if (fdef != null) {
+          if (!fname.contains("_filtered")) { //process fields with no blacklist
             res(index) = fdef.getType match {
              case Type.String | Type.Int =>
                MapDocumentUtil.truncateValue(
                  MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
                  fdef.getLength
                )
 
              case Type.URL =>
                var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
                if (!URL_REGEX.matcher(uv).matches)
                  uv = ""
                uv
 
              case Type.List | Type.JSON =>
                MapDocumentUtil.truncateList(
                  MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
                  fdef.getSize
                ).asScala
 
              case Type.StringConcat =>
                val jpaths = CONCAT_REGEX.split(fdef.getPath)
 
                MapDocumentUtil.truncateValue(
                  jpaths
                    .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
                    .mkString(" "),
                  fdef.getLength
                )
 
              case Type.DoubleArray =>
                MapDocumentUtil.getJPathArray(fdef.getPath, json)
+            }
          }
+          else { //process fields with blacklist
+            val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
+
+            res(index) = fdef.getType match {
+              case Type.List | Type.JSON =>
+                MapDocumentUtil.truncateList(
+                  MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+                  fdef.getSize
+                ).asScala.filter((v: String) => !blacklist.test(v))
+
+              case _ =>
+                val value: String = MapDocumentUtil.truncateValue(
+                  MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+                  fdef.getLength
+                )
+                if (blacklist.test(value)) "" else value
+            }
+          }
+
 
          val filter = fdef.getFilter
 
@@ -125,13 +159,12 @@ case class SparkModel(conf: DedupConfig) {
          }
 
          if (StringUtils.isNotBlank(fdef.getInfer)) {
-           val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
+           val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
            res(index) = res(index) match {
             case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
             case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
           }
         }
-
       }
 
       res
@@ -139,6 +172,7 @@ case class SparkModel(conf: DedupConfig) {
     }
 
     new GenericRowWithSchema(values, schema)
+
   }
 
   def clean(value: String, cleantype: String) : String = {
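The blacklist handling moved from a UDF in SparkDeduper into rowFromJson: list-typed fields drop the values the blacklist matches, scalar fields are blanked out. A compact Java sketch of that Predicate-based filtering, with a made-up predicate standing in for conf.blacklists().get(fieldName):

    import java.util.Arrays;
    import java.util.List;
    import java.util.function.Predicate;
    import java.util.stream.Collectors;

    public class BlacklistFilterSketch {
        public static void main(String[] args) {
            // Stand-in for the configured blacklist: values matching the predicate are rejected.
            Predicate<String> blacklist = v -> v.startsWith("und");

            // List-typed field: keep only the values the blacklist does not match.
            List<String> values = Arrays.asList("english", "undetermined", "italian");
            List<String> filtered = values.stream().filter(v -> !blacklist.test(v)).collect(Collectors.toList());

            // Scalar field: a blacklisted value is blanked out instead of removed.
            String scalar = "undetermined";
            String cleaned = blacklist.test(scalar) ? "" : scalar;

            System.out.println(filtered + " / '" + cleaned + "'");
        }
    }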
@@ -227,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
        System.out.println(cf.apply(conf, Lists.newArrayList(s)));
    }
 
+   @Test
+   public void testNumAuthorsTitleSuffixPrefixChain() {
+
+       final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
+       params.put("mod", 10);
+
+       final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
+       final String num_authors = "10";
+       System.out.println("title = " + title);
+       System.out.println("num_authors = " + num_authors);
+       System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
+   }
+
 }
@@ -1,8 +1,7 @@
 
 package eu.dnetlib.pace.common;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 import org.junit.jupiter.api.*;
 
@@ -54,8 +53,17 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
        System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
    }
 
+   @Test()
+   public void countryInferenceTest_NPE() {
+       assertThrows(
+           NullPointerException.class,
+           () -> countryInference("UNKNOWN", null),
+           "Expected countryInference() to throw an NPE");
+   }
+
    @Test
    public void countryInferenceTest() {
+       assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
        assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
        assertEquals("UK", countryInference("UK", "Università di Bologna"));
        assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
@@ -367,7 +367,18 @@ public class ComparatorTest extends AbstractPaceTest {
 
        result = dateRange.distance("invalid date", "2021-05-02", conf);
        assertEquals(-1.0, result);
+   }
+
+   @Test
+   public void titleVersionMatchTest() {
+
+       TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
+
+       double result = titleVersionMatch
+           .compare(
+               "parp 2 regulates sirt 1 expression and whole body energy expenditure",
+               "parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
+       assertEquals(1.0, result);
    }
 
 }
@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import eu.dnetlib.pace.model.Person;
-import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class UtilTest {
 
@@ -151,12 +151,17 @@ public class PromoteActionPayloadForGraphTableJob {
        SparkSession spark, String path, Class<G> rowClazz) {
        logger.info("Reading graph table from path: {}", path);
 
+       if (HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) {
            return spark
                .read()
                .textFile(path)
                .map(
                    (MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
                    Encoders.bean(rowClazz));
+       } else {
+           logger.info("Found empty graph table from path: {}", path);
+           return spark.emptyDataset(Encoders.bean(rowClazz));
+       }
    }
 
    private static <A extends Oaf> Dataset<A> readActionPayload(
@@ -223,7 +228,7 @@ public class PromoteActionPayloadForGraphTableJob {
            rowClazz,
            actionPayloadClazz);
 
-       if (shouldGroupById) {
+       if (Boolean.TRUE.equals(shouldGroupById)) {
            return PromoteActionPayloadFunctions
                .groupGraphTableByIdAndMerge(
                    joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
@@ -250,6 +255,8 @@ public class PromoteActionPayloadForGraphTableJob {
                return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation());
            case "eu.dnetlib.dhp.schema.oaf.Software":
                return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software());
+           case "eu.dnetlib.dhp.schema.oaf.Person":
+               return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Person());
            default:
                throw new RuntimeException("unknown class: " + clazz.getCanonicalName());
        }
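Guarding the read with an existence check lets the promote job treat a missing graph table as an empty one instead of failing. A minimal sketch of the same guard written against the plain Hadoop FileSystem API rather than the project's HdfsSupport helper; the fallback list stands in for spark.emptyDataset(...):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    import java.io.IOException;
    import java.util.Collections;
    import java.util.List;

    public class EmptyTableFallbackSketch {

        // Read the table only if its path exists, otherwise hand back an empty collection.
        static List<String> readOrEmpty(String path, Configuration conf) throws IOException {
            FileSystem fs = FileSystem.get(conf);
            if (!fs.exists(new Path(path))) {
                return Collections.emptyList(); // analogous to returning an empty Dataset
            }
            // ... real code would read and parse the files under `path` here ...
            return Collections.singletonList("parsed rows would go here");
        }

        public static void main(String[] args) throws IOException {
            System.out.println(readOrEmpty("/tmp/does-not-exist", new Configuration()));
        }
    }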
@@ -50,7 +50,7 @@ public class PromoteActionPayloadFunctions {
        PromoteAction.Strategy promoteActionStrategy,
        Class<G> rowClazz,
        Class<A> actionPayloadClazz) {
-       if (!isSubClass(rowClazz, actionPayloadClazz)) {
+       if (Boolean.FALSE.equals(isSubClass(rowClazz, actionPayloadClazz))) {
            throw new RuntimeException(
                "action payload type must be the same or be a super type of table row type");
        }
@@ -7,3 +7,4 @@ promote_action_payload_for_project_table classpath eu/dnetlib/dhp/actionmanager/
 promote_action_payload_for_publication_table classpath eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app
 promote_action_payload_for_relation_table classpath eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app
 promote_action_payload_for_software_table classpath eu/dnetlib/dhp/actionmanager/wf/software/oozie_app
+promote_action_payload_for_person_table classpath eu/dnetlib/dhp/actionmanager/wf/person/oozie_app

@@ -148,6 +148,7 @@
         <path start="PromoteActionPayloadForPublicationTable"/>
         <path start="PromoteActionPayloadForRelationTable"/>
         <path start="PromoteActionPayloadForSoftwareTable"/>
+        <path start="PromoteActionPayloadForPersonTable"/>
     </fork>
 
     <action name="PromoteActionPayloadForDatasetTable">
@@ -270,6 +271,21 @@
         <error to="Kill"/>
     </action>
 
+    <action name="PromoteActionPayloadForPersonTable">
+        <sub-workflow>
+            <app-path>${wf:appPath()}/promote_action_payload_for_person_table</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>inputActionPayloadRootPath</name>
+                    <value>${workingDir}/action_payload_by_type</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="JoinPromote"/>
+        <error to="Kill"/>
+    </action>
+
     <join name="JoinPromote" to="End"/>
 
     <end name="End"/>
@@ -0,0 +1,129 @@
+<workflow-app name="promote_action_payload_for_person_table" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>activePromotePersonActionPayload</name>
+            <description>when true will promote actions with eu.dnetlib.dhp.schema.oaf.Person payload</description>
+        </property>
+        <property>
+            <name>inputGraphRootPath</name>
+            <description>root location of input materialized graph</description>
+        </property>
+        <property>
+            <name>inputActionPayloadRootPath</name>
+            <description>root location of action payloads to promote</description>
+        </property>
+        <property>
+            <name>outputGraphRootPath</name>
+            <description>root location for output materialized graph</description>
+        </property>
+        <property>
+            <name>mergeAndGetStrategy</name>
+            <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="DecisionPromotePersonActionPayload"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <decision name="DecisionPromotePersonActionPayload">
+        <switch>
+            <case to="PromotePersonActionPayloadForPersonTable">
+                ${(activePromotePersonActionPayload eq "true") and
+                (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputActionPayloadRootPath')),'/'),'clazz=eu.dnetlib.dhp.schema.oaf.Person')) eq "true")}
+            </case>
+            <default to="SkipPromotePersonActionPayloadForPersonTable"/>
+        </switch>
+    </decision>
+
+    <action name="PromotePersonActionPayloadForPersonTable">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>PromotePersonActionPayloadForPersonTable</name>
+            <class>eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob</class>
+            <jar>dhp-actionmanager-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/person</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--inputActionPayloadPath</arg><arg>${inputActionPayloadRootPath}/clazz=eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/person</arg>
+            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="SkipPromotePersonActionPayloadForPersonTable">
+        <distcp xmlns="uri:oozie:distcp-action:0.2">
+            <prepare>
+                <delete path="${outputGraphRootPath}/person"/>
+            </prepare>
+            <arg>-pb</arg>
+            <arg>${inputGraphRootPath}/person</arg>
+            <arg>${outputGraphRootPath}/person</arg>
+        </distcp>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
@@ -34,7 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
 
 /**
- * Creates action sets for Crossref affiliation relations inferred by BIP!
+ * Creates action sets for Crossref affiliation relations inferred by OpenAIRE
  */
 public class PrepareAffiliationRelations implements Serializable {
 
@@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
            .listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
 
        JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
-           spark, crossrefInputPath, collectedfromOpenAIRE);
+           spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");
 
        JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
-           spark, pubmedInputPath, collectedfromOpenAIRE);
+           spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");
 
        JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
-           spark, openapcInputPath, collectedfromOpenAIRE);
+           spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");
 
-       JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
-           spark, dataciteInputPath, collectedfromOpenAIRE);
+       JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
+           spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");
 
-       JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
-           spark, webcrawlInputPath, collectedfromOpenAIRE);
+       JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
+           spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");
 
-       JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
-           spark, publisherlInputPath, collectedfromOpenAIRE);
+       JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
+           spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");
 
        crossrefRelations
            .union(pubmedRelations)
@@ -133,7 +133,8 @@ public class PrepareAffiliationRelations implements Serializable {
 
    private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
        String inputPath,
-       List<KeyValue> collectedfrom) {
+       List<KeyValue> collectedfrom,
+       String dataprovenance) {
 
        Dataset<Row> df = spark
            .read()
@@ -142,12 +143,13 @@ public class PrepareAffiliationRelations implements Serializable {
            .json(inputPath)
            .where("DOI is not null");
 
-       return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+       return getTextTextJavaPairRDDNew(
+           collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
 
    }
 
    private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
-       List<KeyValue> collectedfrom) {
+       List<KeyValue> collectedfrom, String dataprovenance) {
 
        Dataset<Row> df = spark
            .read()
@@ -155,13 +157,14 @@ public class PrepareAffiliationRelations implements Serializable {
            .json(inputPath)
            .where("DOI is not null");
 
-       return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+       return getTextTextJavaPairRDD(
+           collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
 
    }
 
    private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
        String inputPath,
-       List<KeyValue> collectedfrom) {
+       List<KeyValue> collectedfrom, String dataprovenance) {
 
        // load and parse affiliation relations from HDFS
        Dataset<Row> df = spark
@@ -170,12 +173,12 @@ public class PrepareAffiliationRelations implements Serializable {
            .json(inputPath)
            .where("DOI is not null");
 
-       return getTextTextJavaPairRDD(collectedfrom, df);
+       return getTextTextJavaPairRDD(collectedfrom, df, dataprovenance);
    }
 
    private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
        String inputPath,
-       List<KeyValue> collectedfrom) {
+       List<KeyValue> collectedfrom, String dataprovenance) {
        // load and parse affiliation relations from HDFS
        Dataset<Row> df = spark
            .read()
@@ -184,10 +187,11 @@ public class PrepareAffiliationRelations implements Serializable {
            .json(inputPath)
            .where("DOI is not null");
 
-       return getTextTextJavaPairRDDNew(collectedfrom, df);
+       return getTextTextJavaPairRDDNew(collectedfrom, df, dataprovenance);
    }
 
-   private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
+   private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df,
+       String dataprovenance) {
        // unroll nested arrays
        df = df
            .withColumn("matching", functions.explode(new Column("Matchings")))
@@ -219,7 +223,7 @@ public class PrepareAffiliationRelations implements Serializable {
                DataInfo dataInfo = OafMapperUtils
                    .dataInfo(
                        false,
-                       BIP_INFERENCE_PROVENANCE,
+                       dataprovenance,
                        true,
                        false,
                        qualifier,
@@ -235,7 +239,8 @@ public class PrepareAffiliationRelations implements Serializable {
                    new Text(OBJECT_MAPPER.writeValueAsString(aa))));
    }
 
-   private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
+   private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df,
+       String dataprovenance) {
        // unroll nested arrays
        df = df
            .withColumn("matching", functions.explode(new Column("Matchings")))
@@ -276,7 +281,7 @@ public class PrepareAffiliationRelations implements Serializable {
                DataInfo dataInfo = OafMapperUtils
                    .dataInfo(
                        false,
-                       BIP_INFERENCE_PROVENANCE,
+                       dataprovenance,
                        true,
                        false,
                        qualifier,
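Each input source now carries its own provenance suffix instead of the single shared BIP_INFERENCE_PROVENANCE value, so the resulting DataInfo records can be told apart downstream. A trivial sketch of how those per-source labels are composed; the base value below is a hypothetical placeholder, not the project's actual constant:

    import java.util.Arrays;
    import java.util.List;

    public class ProvenanceLabelSketch {

        // Placeholder standing in for BIP_INFERENCE_PROVENANCE.
        private static final String BASE_PROVENANCE = "iis::document_affiliations";

        public static void main(String[] args) {
            // One suffix per input source, matching the suffixes used in the hunk above.
            List<String> sources = Arrays.asList("crossref", "pubmed", "openapc", "datacite", "rawaff", "webcrawl");
            sources.forEach(s -> System.out.println(BASE_PROVENANCE + ":" + s));
        }
    }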
@@ -2,21 +2,31 @@
 package eu.dnetlib.dhp.actionmanager.personentity;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.*;

+import java.io.BufferedWriter;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;

 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.*;
 import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -28,13 +38,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.collection.orcid.model.Author;
 import eu.dnetlib.dhp.collection.orcid.model.Employment;
 import eu.dnetlib.dhp.collection.orcid.model.Work;
+import eu.dnetlib.dhp.common.DbClient;
 import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
+import eu.dnetlib.dhp.common.person.Coauthors;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Person;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;

@@ -44,7 +55,7 @@ import scala.Tuple2;

 public class ExtractPerson implements Serializable {
 private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
+private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'";
 private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 private static final String OPENAIRE_PREFIX = "openaire____";
 private static final String SEPARATOR = "::";

@@ -58,9 +69,48 @@ public class ExtractPerson implements Serializable {

 private static final String PMCID_PREFIX = "50|pmcid_______::";
 private static final String ROR_PREFIX = "20|ror_________::";
-private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
+private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class)
+ + IdentifierFactory.ID_PREFIX_SEPARATOR + ModelConstants.ORCID + "_______";
+private static final String PROJECT_ID_PREFIX = ModelSupport.getIdPrefix(Project.class)
+ + IdentifierFactory.ID_PREFIX_SEPARATOR;

 public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
 public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
+public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase";
+public static final String FUNDER_AUTHORS_CLASSNAME = "Imported from Funder Database";
+public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
+public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
+
+public static List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
+.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
+
+public static final DataInfo ORCIDDATAINFO = OafMapperUtils
+.dataInfo(
+false,
+null,
+false,
+false,
+OafMapperUtils
+.qualifier(
+ORCID_AUTHORS_CLASSID,
+ORCID_AUTHORS_CLASSNAME,
+ModelConstants.DNET_PROVENANCE_ACTIONS,
+ModelConstants.DNET_PROVENANCE_ACTIONS),
+"0.91");
+
+public static final DataInfo FUNDERDATAINFO = OafMapperUtils
+.dataInfo(
+false,
+null,
+false,
+false,
+OafMapperUtils
+.qualifier(
+FUNDER_AUTHORS_CLASSID,
+FUNDER_AUTHORS_CLASSNAME,
+ModelConstants.DNET_PROVENANCE_ACTIONS,
+ModelConstants.DNET_PROVENANCE_ACTIONS),
+"0.91");
+
 public static void main(final String[] args) throws IOException, ParseException {

@@ -91,19 +141,130 @@ public class ExtractPerson implements Serializable {
 final String workingDir = parser.get("workingDir");
 log.info("workingDir {}", workingDir);

+final String dbUrl = parser.get("postgresUrl");
+final String dbUser = parser.get("postgresUser");
+final String dbPassword = parser.get("postgresPassword");
+
+final String hdfsNameNode = parser.get("hdfsNameNode");
+
 SparkConf conf = new SparkConf();
 runWithSparkSession(
 conf,
 isSparkSessionManaged,
 spark -> {
 HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
-createActionSet(spark, inputPath, outputPath, workingDir);
+extractInfoForActionSetFromORCID(spark, inputPath, workingDir);
+extractInfoForActionSetFromProjects(
+spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode);
+createActionSet(spark, outputPath, workingDir);
 });

 }

-private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
+private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir,
+String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException {
+
+Configuration conf = new Configuration();
+conf.set("fs.defaultFS", hdfsNameNode);
+
+FileSystem fileSystem = FileSystem.get(conf);
+Path hdfsWritePath = new Path(hdfsPath);
+FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
+try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
+try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
+dbClient.processResults(QUERY, rs -> writeRelation(getRelationWithProject(rs), writer));
+}
+
+} catch (IOException e) {
+throw new RuntimeException(e);
+}
+
+}
+
+public static Relation getRelationWithProject(ResultSet rs) {
+try {
+return getProjectRelation(
+rs.getString("project"), rs.getString("pid"),
+rs.getString("role"));
+} catch (final SQLException e) {
+throw new RuntimeException(e);
+}
+}
+
+private static Relation getProjectRelation(String project, String orcid, String role) {
+
+String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
+String target = PROJECT_ID_PREFIX + StringUtils.substringBefore(project, "::") + "::"
+ + IdentifierFactory.md5(StringUtils.substringAfter(project, "::"));
+List<KeyValue> properties = new ArrayList<>();
+
+Relation relation = OafMapperUtils
+.getRelation(
+source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE,
+ModelConstants.PROJECT_PERSON_PARTICIPATES,
+collectedfromOpenAIRE,
+FUNDERDATAINFO,
+null);
+relation.setValidated(true);
+
+if (StringUtil.isNotBlank(role)) {
+KeyValue kv = new KeyValue();
+kv.setKey("role");
+kv.setValue(role);
+properties.add(kv);
+}
+
+if (!properties.isEmpty())
+relation.setProperties(properties);
+return relation;
+
+}
+
+protected static void writeRelation(final Relation relation, BufferedWriter writer) {
+try {
+writer.write(OBJECT_MAPPER.writeValueAsString(relation));
+writer.newLine();
+} catch (final IOException e) {
+throw new RuntimeException(e);
+}
+}
+
+private static void createActionSet(SparkSession spark, String outputPath, String workingDir) {
+
+Dataset<Person> people;
+people = spark
+.read()
+.textFile(workingDir + "/people")
+.map(
+(MapFunction<String, Person>) value -> OBJECT_MAPPER
+.readValue(value, Person.class),
+Encoders.bean(Person.class));
+
+people
+.toJavaRDD()
+.map(p -> new AtomicAction(p.getClass(), p))
+.union(
+getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
+.union(
+getRelations(spark, workingDir + "/coauthorship")
+.toJavaRDD()
+.map(r -> new AtomicAction(r.getClass(), r)))
+.union(
+getRelations(spark, workingDir + "/affiliation")
+.toJavaRDD()
+.map(r -> new AtomicAction(r.getClass(), r)))
+.union(
+getRelations(spark, workingDir + "/project")
+.toJavaRDD()
+.map(r -> new AtomicAction(r.getClass(), r)))
+.mapToPair(
+aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
+new Text(OBJECT_MAPPER.writeValueAsString(aa))))
+.saveAsHadoopFile(
+outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
+}
+
+private static void extractInfoForActionSetFromORCID(SparkSession spark, String inputPath, String workingDir) {
 Dataset<Author> authors = spark
 .read()
 .parquet(inputPath + "Authors")

@@ -129,18 +290,13 @@ public class ExtractPerson implements Serializable {
 .parquet(inputPath + "Employments")
 .as(Encoders.bean(Employment.class));

-Dataset<Author> peopleToMap = authors
-.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
-.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
-.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
-.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));

 Dataset<Employment> employment = employmentDataset
-.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
+.joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid")))
 .map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));

-Dataset<Person> people;
-peopleToMap.map((MapFunction<Author, Person>) op -> {
+// Mapping all the orcid profiles even if the profile has no visible works
+authors.map((MapFunction<Author, Person>) op -> {
 Person person = new Person();
 person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
 person

@@ -190,9 +346,19 @@ public class ExtractPerson implements Serializable {
 OafMapperUtils
 .structuredProperty(
 op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
-ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
+ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
+OafMapperUtils.dataInfo(false,
+null,
+false,
+false,
+OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+ModelConstants.DNET_PID_TYPES,
+ModelConstants.DNET_PID_TYPES),
+"0.91")));
 person.setDateofcollection(op.getLastModifiedDate());
 person.setOriginalId(Arrays.asList(op.getOrcid()));
+person.setDataInfo(ORCIDDATAINFO);
 return person;
 }, Encoders.bean(Person.class))
 .write()

@@ -246,34 +412,6 @@ public class ExtractPerson implements Serializable {
 .option("compression", "gzip")
 .mode(SaveMode.Overwrite)
 .json(workingDir + "/affiliation");
-
-people = spark
-.read()
-.textFile(workingDir + "/people")
-.map(
-(MapFunction<String, Person>) value -> OBJECT_MAPPER
-.readValue(value, Person.class),
-Encoders.bean(Person.class));
-
-people.show(false);
-people
-.toJavaRDD()
-.map(p -> new AtomicAction(p.getClass(), p))
-.union(
-getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
-.union(
-getRelations(spark, workingDir + "/coauthorship")
-.toJavaRDD()
-.map(r -> new AtomicAction(r.getClass(), r)))
-.union(
-getRelations(spark, workingDir + "/affiliation")
-.toJavaRDD()
-.map(r -> new AtomicAction(r.getClass(), r)))
-.mapToPair(
-aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
-new Text(OBJECT_MAPPER.writeValueAsString(aa))))
-.saveAsHadoopFile(
-outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 }

 private static Dataset<Relation> getRelations(SparkSession spark, String path) {

@@ -307,15 +445,9 @@ public class ExtractPerson implements Serializable {
 source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
 ModelConstants.ORG_PERSON_PARTICIPATES,
 Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-OafMapperUtils
-.dataInfo(
-false, null, false, false,
-OafMapperUtils
-.qualifier(
-ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
-ModelConstants.DNET_PROVENANCE_ACTIONS),
-"0.91"),
+ORCIDDATAINFO,
 null);
+relation.setValidated(true);

 if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
 KeyValue kv = new KeyValue();

@@ -336,45 +468,6 @@ public class ExtractPerson implements Serializable {

 }

-private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
-String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
-String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
-
-return Arrays
-.asList(
-OafMapperUtils
-.getRelation(
-source, target, ModelConstants.PERSON_PERSON_RELTYPE,
-ModelConstants.PERSON_PERSON_SUBRELTYPE,
-ModelConstants.PERSON_PERSON_HASCOAUTHORED,
-Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-OafMapperUtils
-.dataInfo(
-false, null, false, false,
-OafMapperUtils
-.qualifier(
-ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
-ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
-"0.91"),
-null),
-OafMapperUtils
-.getRelation(
-target, source, ModelConstants.PERSON_PERSON_RELTYPE,
-ModelConstants.PERSON_PERSON_SUBRELTYPE,
-ModelConstants.PERSON_PERSON_HASCOAUTHORED,
-Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-OafMapperUtils
-.dataInfo(
-false, null, false, false,
-OafMapperUtils
-.qualifier(
-ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
-ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
-"0.91"),
-null));
-
-}
-
 private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {

 if (Optional.ofNullable(w.getPids()).isPresent())

@@ -417,21 +510,15 @@ public class ExtractPerson implements Serializable {
 default:
 return null;
 }
-return OafMapperUtils
+Relation relation = OafMapperUtils
 .getRelation(
 source, target, ModelConstants.RESULT_PERSON_RELTYPE,
 ModelConstants.RESULT_PERSON_SUBRELTYPE,
 ModelConstants.RESULT_PERSON_HASAUTHORED,
 Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-OafMapperUtils
-.dataInfo(
-false, null, false, false,
-OafMapperUtils
-.qualifier(
-ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
-ModelConstants.DNET_PROVENANCE_ACTIONS),
-"0.91"),
+ORCIDDATAINFO,
 null);
+relation.setValidated(true);
+return relation;
 }
 }
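The new project-person relations above are keyed by OpenAIRE identifiers built from an ORCID and a `namespace::code` project string, hashing only the ORCID and the project-specific suffix. A minimal sketch of that construction (plain JDK MD5 rather than the project's IdentifierFactory; the `30|orcid_______::` and `40|` prefixes are examples, not authoritative values):

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;

    public class PersonProjectIdSketch {

        // hex-encoded MD5, standing in for IdentifierFactory.md5
        static String md5(String s) throws Exception {
            MessageDigest md = MessageDigest.getInstance("MD5");
            StringBuilder sb = new StringBuilder();
            for (byte b : md.digest(s.getBytes(StandardCharsets.UTF_8))) {
                sb.append(String.format("%02x", b));
            }
            return sb.toString();
        }

        public static void main(String[] args) throws Exception {
            String orcid = "0000-0002-1825-0097";
            String project = "corda__h2020::123456"; // funder namespace :: project code
            String source = "30|orcid_______::" + md5(orcid);
            // keep the funder namespace, hash only the project-specific suffix
            String target = "40|" + project.substring(0, project.indexOf("::")) + "::"
                + md5(project.substring(project.indexOf("::") + 2));
            System.out.println(source + " -> " + target);
        }
    }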
@@ -31,9 +31,11 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
 # The following is needed as a property of a workflow
 oozie.wf.application.path=${oozieTopWfApplicationPath}

-crossrefInputPath=/data/bip-affiliations/crossref-data.json
+crossrefInputPath=/data/openaire-affiliations/crossref-data.json
-pubmedInputPath=/data/bip-affiliations/pubmed-data.json
+pubmedInputPath=/data/openaire-affiliations/pubmed-data-v4.json
-openapcInputPath=/data/bip-affiliations/openapc-data.json
+openapcInputPath=/data/openaire-affiliations/openapc-data.json
-dataciteInputPath=/data/bip-affiliations/datacite-data.json
+dataciteInputPath=/data/openaire-affiliations/datacite-data.json
+webCrawlInputPath=/data/openaire-affiliations/webCrawl
+publisherInputPath=/data/openaire-affiliations/publishers

-outputPath=/tmp/crossref-affiliations-output-v5
+outputPath=/tmp/affRoAS
@@ -1,4 +1,4 @@
-<workflow-app name="BipAffiliations" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="OpenAIREAffiliations" xmlns="uri:oozie:workflow:0.5">
 <parameters>

 <property>

@@ -21,6 +21,10 @@
 <name>webCrawlInputPath</name>
 <description>the path where to find the inferred affiliation relations from webCrawl</description>
 </property>
+<property>
+<name>publisherInputPath</name>
+<description>the path where to find the inferred affiliation relations from publisher websites</description>
+</property>
 <property>
 <name>outputPath</name>
 <description>the path where to store the actionset</description>

@@ -99,7 +103,7 @@
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
 <mode>cluster</mode>
-<name>Produces the atomic action with the inferred by BIP! affiliation relations (from Crossref and Pubmed)</name>
+<name>Produces the atomic action with the inferred by OpenAIRE affiliation relations</name>
 <class>eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations</class>
 <jar>dhp-aggregation-${projectVersion}.jar</jar>
 <spark-opts>

@@ -117,6 +121,7 @@
 <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
 <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
 <arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
+<arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
 <arg>--outputPath</arg><arg>${outputPath}</arg>
 </spark>
 <ok to="End"/>
@@ -21,5 +21,30 @@
 "paramLongName": "workingDir",
 "paramDescription": "the hdfs name node",
 "paramRequired": false
+},
+{
+"paramName": "pu",
+"paramLongName": "postgresUrl",
+"paramDescription": "the hdfs name node",
+"paramRequired": false
+},
+
+{
+"paramName": "ps",
+"paramLongName": "postgresUser",
+"paramDescription": "the hdfs name node",
+"paramRequired": false
+},
+{
+"paramName": "pp",
+"paramLongName": "postgresPassword",
+"paramDescription": "the hdfs name node",
+"paramRequired": false
+},{
+"paramName": "nn",
+"paramLongName": "hdfsNameNode",
+"paramDescription": "the hdfs name node",
+"paramRequired": false
 }

 ]
@@ -1,2 +1,5 @@
 inputPath=/data/orcid_2023/tables/
 outputPath=/user/miriam.baglioni/peopleAS
+postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
+postgresUser=dnet
+postgresPassword=dnetPwd
@@ -9,6 +9,18 @@
 <name>outputPath</name>
 <description>the path where to store the actionset</description>
 </property>
+<property>
+<name>postgresUrl</name>
+<description>the path where to store the actionset</description>
+</property>
+<property>
+<name>postgresUser</name>
+<description>the path where to store the actionset</description>
+</property>
+<property>
+<name>postgresPassword</name>
+<description>the path where to store the actionset</description>
+</property>
 <property>
 <name>sparkDriverMemory</name>
 <description>memory for driver process</description>

@@ -102,6 +114,10 @@
 <arg>--inputPath</arg><arg>${inputPath}</arg>
 <arg>--outputPath</arg><arg>${outputPath}</arg>
 <arg>--workingDir</arg><arg>${workingDir}</arg>
+<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+<arg>--postgresUrl</arg><arg>${postgresUrl}</arg>
+<arg>--postgresUser</arg><arg>${postgresUser}</arg>
+<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
 </spark>
 <ok to="End"/>
 <error to="Kill"/>
@@ -24,7 +24,7 @@

 <decision name="resume_from">
 <switch>
-<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
+<case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
 <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
 </switch>
 </decision>

@@ -33,6 +33,14 @@
 <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 </kill>

+<action name="reset_workingDir">
+<fs>
+<delete path="${workingDir}"/>
+<mkdir path="${workingDir}"/>
+</fs>
+<ok to="download"/>
+<error to="Kill"/>
+</action>
 <action name="download">
 <shell xmlns="uri:oozie:shell-action:0.2">
 <job-tracker>${jobTracker}</job-tracker>
@@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{
 PidType
 }
 import eu.dnetlib.dhp.utils.DHPUtils
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.Row
 import org.json4s
 import org.json4s.DefaultFormats

@@ -673,11 +673,12 @@ case object Crossref2Oaf {
 val doi = input.getString(0)
 val rorId = input.getString(1)

-val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
+val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true)
 val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)

 val r: Relation = new Relation
-DoiCleaningRule.clean(doi)
 r.setSource(pubId)
 r.setTarget(affId)
 r.setRelType(ModelConstants.RESULT_ORGANIZATION)

@@ -978,7 +979,26 @@ case object Crossref2Oaf {
 case "10.13039/501100010790" =>
 generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
 case _ => logger.debug("no match for " + funder.DOI.get)
+//Add for Danish funders
+//Independent Research Fund Denmark (IRFD)
+case "10.13039/501100004836" =>
+generateSimpleRelationFromAward(funder, "irfd________", a => a)
+val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+//Carlsberg Foundation (CF)
+case "10.13039/501100002808" =>
+generateSimpleRelationFromAward(funder, "cf__________", a => a)
+val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+//Novo Nordisk Foundation (NNF)
+case "10.13039/501100009708" =>
+generateSimpleRelationFromAward(funder, "nnf___________", a => a)
+val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+case _ => logger.debug("no match for " + funder.DOI.get)
 }

 } else {
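The Danish funder cases added above map a Crossref funder registry DOI to an OpenAIRE funder namespace prefix before generating the project relations. An illustrative Java lookup of the same mapping (the padded prefixes are copied from the diff and may differ from the canonical registry values):

    import java.util.Map;

    public class DanishFunderMapping {

        // funder registry DOI -> 12-character funder namespace (padded with '_')
        static final Map<String, String> FUNDER_PREFIX = Map.of(
            "10.13039/501100004836", "irfd________", // Independent Research Fund Denmark
            "10.13039/501100002808", "cf__________", // Carlsberg Foundation
            "10.13039/501100009708", "nnf_________"  // Novo Nordisk Foundation
        );

        public static void main(String[] args) {
            String funderDoi = "10.13039/501100002808";
            System.out.println(FUNDER_PREFIX.getOrDefault(funderDoi, "no match for " + funderDoi));
        }
    }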
@@ -98,9 +98,9 @@ public class PrepareAffiliationRelationsTest {
 "-crossrefInputPath", crossrefAffiliationRelationPathNew,
 "-pubmedInputPath", crossrefAffiliationRelationPath,
 "-openapcInputPath", crossrefAffiliationRelationPathNew,
-"-dataciteInputPath", crossrefAffiliationRelationPath,
+"-dataciteInputPath", crossrefAffiliationRelationPathNew,
-"-webCrawlInputPath", crossrefAffiliationRelationPath,
+"-webCrawlInputPath", crossrefAffiliationRelationPathNew,
-"-publisherInputPath", publisherAffiliationRelationOldPath,
+"-publisherInputPath", publisherAffiliationRelationPath,
 "-outputPath", outputPath
 });

@@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest {
 .map(aa -> ((Relation) aa.getPayload()));

 // count the number of relations
-assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 =
+assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 =

 Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
 dataset.createOrReplaceTempView("result");

@@ -123,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
 // verify that we have equal number of bi-directional relations
 Assertions
 .assertEquals(
-75, execVerification
+81, execVerification
 .filter(
 "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
 .collectAsList()

@@ -131,7 +131,7 @@ public class PrepareAffiliationRelationsTest {

 Assertions
 .assertEquals(
-75, execVerification
+81, execVerification
 .filter(
 "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
 .collectAsList()

@@ -158,7 +158,7 @@ public class PrepareAffiliationRelationsTest {

 Assertions
 .assertEquals(
-2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
+4, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());

 Assertions
 .assertEquals(

@@ -173,7 +173,7 @@ public class PrepareAffiliationRelationsTest {

 Assertions
 .assertEquals(
-3, execVerification
+1, execVerification
 .filter(
 "source = '" + ID_PREFIX
 + IdentifierFactory
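The updated expectations above follow from the per-source contributions stated in the test comment: 18 + 24 + 30 * 4 = 162 relations in the action set, which is 81 per direction of the bidirectional affiliation relation. A quick arithmetic check:

    public class AffiliationCountCheck {
        public static void main(String[] args) {
            int total = 18 + 24 + 30 * 4; // per-source contributions as stated in the test comment
            System.out.println(total);     // 162 relations overall
            System.out.println(total / 2); // 81 relations per direction
        }
    }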
@@ -63,6 +63,7 @@
 <path start="copy_software"/>
 <path start="copy_datasource"/>
 <path start="copy_project"/>
+<path start="copy_person"/>
 <path start="copy_organization"/>
 </fork>

@@ -120,6 +121,15 @@
 <error to="Kill"/>
 </action>

+<action name="copy_person">
+<distcp xmlns="uri:oozie:distcp-action:0.2">
+<arg>${nameNode}/${sourcePath}/person</arg>
+<arg>${nameNode}/${outputPath}/person</arg>
+</distcp>
+<ok to="wait"/>
+<error to="Kill"/>
+</action>
+
 <action name="copy_datasource">
 <distcp xmlns="uri:oozie:distcp-action:0.2">
 <arg>${nameNode}/${sourcePath}/datasource</arg>
@@ -2,14 +2,13 @@
 package eu.dnetlib.dhp.oa.dedup;

 import java.util.*;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.commons.beanutils.BeanUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.FlatMapGroupsFunction;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;

 import eu.dnetlib.dhp.oa.dedup.model.Identifier;

@@ -107,6 +106,8 @@ public class DedupRecordFactory {

 final HashSet<String> acceptanceDate = new HashSet<>();

+boolean isVisible = false;
+
 while (it.hasNext()) {
 Tuple3<String, String, OafEntity> t = it.next();
 OafEntity entity = t._3();

@@ -114,6 +115,7 @@ public class DedupRecordFactory {
 if (entity == null) {
 aliases.add(t._2());
 } else {
+isVisible = isVisible || !entity.getDataInfo().getInvisible();
 cliques.add(entity);

 if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {

@@ -129,13 +131,20 @@ public class DedupRecordFactory {

 }

-if (acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
+if (!isVisible || acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
 return Collections.emptyIterator();
 }

-OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
+OafEntity mergedEntity = MergeUtils.mergeGroup(cliques.iterator());
 // dedup records do not have date of transformation attribute
 mergedEntity.setDateoftransformation(null);
+mergedEntity
+.setMergedIds(
+Stream
+.concat(cliques.stream().map(OafEntity::getId), aliases.stream())
+.distinct()
+.sorted()
+.collect(Collectors.toList()));
+
 return Stream
 .concat(
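The mergedIds added to the dedup record above are simply the ids of the merged records (the clique) plus the unresolved aliases, deduplicated and sorted. A standalone rendering of the same stream pipeline, with toy ids:

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class MergedIdsSketch {
        public static void main(String[] args) {
            List<String> cliqueIds = Arrays.asList("50|a", "50|b", "50|a");
            List<String> aliases = Arrays.asList("50|c", "50|b");
            List<String> mergedIds = Stream
                .concat(cliqueIds.stream(), aliases.stream())
                .distinct()
                .sorted()
                .collect(Collectors.toList());
            System.out.println(mergedIds); // [50|a, 50|b, 50|c]
        }
    }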
@@ -91,7 +91,6 @@ public class SparkBlockStats extends AbstractSparkAction {
 .read()
 .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 .transform(deduper.model().parseJsonDataset())
-.transform(deduper.filterAndCleanup())
 .transform(deduper.generateClustersWithCollect())
 .filter(functions.size(new Column("block")).geq(1));
@@ -5,11 +5,11 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTION
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP;

 import java.io.IOException;
+import java.util.Arrays;

 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.*;
 import org.dom4j.DocumentException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -17,6 +17,7 @@ import org.xml.sax.SAXException;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;

@@ -25,6 +26,8 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
+import scala.collection.JavaConversions;
+import scala.collection.JavaConverters;

 public class SparkCreateDedupRecord extends AbstractSparkAction {

@@ -85,6 +88,36 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
 .mode(SaveMode.Overwrite)
 .option("compression", "gzip")
 .json(outputPath);

+log.info("Updating mergerels for: '{}'", subEntity);
+final Dataset<Row> dedupIds = spark
+.read()
+.schema("`id` STRING, `mergedIds` ARRAY<STRING>")
+.json(outputPath)
+.selectExpr("id as source", "explode(mergedIds) as target");
+spark
+.read()
+.load(mergeRelPath)
+.where("relClass == 'merges'")
+.join(dedupIds, JavaConversions.asScalaBuffer(Arrays.asList("source", "target")), "left_semi")
+.write()
+.mode(SaveMode.Overwrite)
+.option("compression", "gzip")
+.save(workingPath + "/mergerel_filtered");
+
+final Dataset<Row> validRels = spark.read().load(workingPath + "/mergerel_filtered");
+
+final Dataset<Row> filteredMergeRels = validRels
+.union(
+validRels
+.withColumnRenamed("source", "source_tmp")
+.withColumnRenamed("target", "target_tmp")
+.withColumn("relClass", functions.lit(ModelConstants.IS_MERGED_IN))
+.withColumnRenamed("target_tmp", "source")
+.withColumnRenamed("source_tmp", "target"));
+
+saveParquet(filteredMergeRels, mergeRelPath, SaveMode.Overwrite);
+removeOutputDir(spark, workingPath + "/mergerel_filtered");
 }
 }
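The mergerel update above keeps only the 'merges' relations whose (source, target) pair also appears among the dedup records' mergedIds, then mirrors them as IS_MERGED_IN. A minimal sketch of the same left_semi filtering with the Spark Java API (paths and schemas are illustrative, not the project's actual layout):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class MergeRelFilterSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("mergerel-filter").getOrCreate();

            // dedup records exploded into (source, target) pairs, as in the commit
            Dataset<Row> dedupIds = spark
                .read()
                .schema("`id` STRING, `mergedIds` ARRAY<STRING>")
                .json("/tmp/dedup_records") // illustrative path
                .selectExpr("id as source", "explode(mergedIds) as target");

            Dataset<Row> mergeRels = spark.read().parquet("/tmp/mergerel"); // illustrative path

            // keep only merge relations confirmed by a dedup record (left_semi = filtering join)
            Dataset<Row> filtered = mergeRels
                .where("relClass = 'merges'")
                .join(dedupIds,
                    mergeRels.col("source").equalTo(dedupIds.col("source"))
                        .and(mergeRels.col("target").equalTo(dedupIds.col("target"))),
                    "left_semi");

            filtered.write().mode("overwrite").parquet("/tmp/mergerel_filtered");
            spark.stop();
        }
    }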
@@ -69,6 +69,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {

 Dataset<Relation> mergeRels = spark
 .read()
+.schema(REL_BEAN_ENC.schema())
 .load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
 .as(REL_BEAN_ENC);
@@ -46,8 +46,8 @@ class DatasetMergerTest implements Serializable {
 }

 @Test
-void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
-Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+void datasetMergerTest() {
+Dataset pub_merged = MergeUtils.mergeGroup(datasets.stream().map(Tuple2::_2).iterator());

 // verify id
 assertEquals(dedupId, pub_merged.getId());
@@ -96,7 +96,7 @@
 "aggregation": "MAX",
 "positive": "layer4",
 "negative": "NO_MATCH",
-"undefined": "MATCH",
+"undefined": "layer4",
 "ignoreUndefined": "true"
 },
 "layer4": {
@@ -7,7 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactor
 import eu.dnetlib.dhp.utils.DHPUtils
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil._
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST._

@@ -560,9 +560,32 @@ case object Crossref2Oaf {
 "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
 "10.13039/501100013589" | "10.13039/501100000271" =>
 generateSimpleRelationFromAward(funder, "ukri________", a => a)
+//DFG
+case "10.13039/501100001659" =>
+val targetId = getProjectId("dfgf________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+case _ => logger.debug("no match for " + funder.DOI.get)
+//Add for Danish funders
+//Independent Research Fund Denmark (IRFD)
+case "10.13039/501100004836" =>
+generateSimpleRelationFromAward(funder, "irfd________", a => a)
+val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+//Carlsberg Foundation (CF)
+case "10.13039/501100002808" =>
+generateSimpleRelationFromAward(funder, "cf__________", a => a)
+val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+//Novo Nordisk Foundation (NNF)
+case "10.13039/501100009708" =>
+generateSimpleRelationFromAward(funder, "nnf___________", a => a)
+val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
+queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
 case _ => logger.debug("no match for " + funder.DOI.get)

 }

 } else {
@@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST._
@@ -48,12 +48,7 @@
 <groupId>io.github.classgraph</groupId>
 <artifactId>classgraph</artifactId>
 </dependency>
-<dependency>
-<groupId>eu.dnetlib.dhp</groupId>
-<artifactId>dhp-aggregation</artifactId>
-<version>1.2.5-SNAPSHOT</version>
-<scope>compile</scope>
-</dependency>

 </dependencies>
@@ -6,11 +6,11 @@ import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;

+import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.amazonaws.util.StringUtils;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Maps;

@@ -81,7 +81,7 @@ public class Utils implements Serializable {
 Community c = new Community();
 c.setId(cm.getId());
 c.setZenodoCommunities(cm.getOtherZenodoCommunities());
-if (!StringUtils.isNullOrEmpty(cm.getZenodoCommunity()))
+if (StringUtils.isNotBlank(cm.getZenodoCommunity()))
 c.getZenodoCommunities().add(cm.getZenodoCommunity());
 c.setSubjects(cm.getSubjects());
 c.getSubjects().addAll(cm.getFos());
@@ -13,13 +13,13 @@ public class CommunityContentprovider {
 private String openaireId;
 private SelectionConstraints selectioncriteria;

-private String enabled;
+private Boolean enabled;

-public String getEnabled() {
+public Boolean getEnabled() {
 return enabled;
 }

-public void setEnabled(String enabled) {
+public void setEnabled(Boolean enabled) {
 this.enabled = enabled;
 }
@@ -4,7 +4,7 @@ package eu.dnetlib.dhp.bulktag.community;
 import java.io.Serializable;
 import java.lang.reflect.InvocationTargetException;

-import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonIgnore;

 import eu.dnetlib.dhp.bulktag.criteria.Selection;
 import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
@@ -0,0 +1,302 @@
+
+package eu.dnetlib.dhp.person;
+
+import static com.ibm.icu.text.PluralRules.Operand.w;
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.MapGroupsFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
+import eu.dnetlib.dhp.common.person.Coauthors;
+import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import scala.Tuple2;
+
+public class SparkExtractPersonRelations {
+
+private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class);
+private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
+
+public static final DataInfo DATAINFO = OafMapperUtils
+.dataInfo(
+false,
+"openaire",
+true,
+false,
+OafMapperUtils
+.qualifier(
+ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
+ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
+ModelConstants.DNET_PROVENANCE_ACTIONS,
+ModelConstants.DNET_PROVENANCE_ACTIONS),
+"0.85");
+
+public static void main(String[] args) throws Exception {
+
+String jsonConfiguration = IOUtils
+.toString(
+SparkCountryPropagationJob.class
+.getResourceAsStream(
+"/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json"));
+
+final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+parser.parseArgument(args);
+
+Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+String sourcePath = parser.get("sourcePath");
+log.info("sourcePath: {}", sourcePath);
+
+final String workingPath = parser.get("outputPath");
+log.info("workingPath: {}", workingPath);
+
+SparkConf conf = new SparkConf();
+runWithSparkSession(
+conf,
+isSparkSessionManaged,
+spark -> {
+
+extractRelations(
+spark,
+sourcePath,
+workingPath);
+removeIsolatedPerson(spark, sourcePath, workingPath);
+});
+}
+
+private static void removeIsolatedPerson(SparkSession spark, String sourcePath, String workingPath) {
+Dataset<Person> personDataset = spark
+.read()
+.schema(Encoders.bean(Person.class).schema())
+.json(sourcePath + "person")
+.as(Encoders.bean(Person.class));
+
+Dataset<Relation> relationDataset = spark
+.read()
+.schema(Encoders.bean(Relation.class).schema())
+.json(sourcePath + "relation")
+.as(Encoders.bean(Relation.class));
+
+personDataset
+.join(relationDataset, personDataset.col("id").equalTo(relationDataset.col("source")), "left_semi")
+.write()
+.option("compression", "gzip")
+.mode(SaveMode.Overwrite)
+.json(workingPath + "person");
+
+spark
+.read()
+.schema(Encoders.bean(Person.class).schema())
+.json(workingPath + "person")
+.write()
+.mode(SaveMode.Overwrite)
+.option("compression", "gzip")
+.json(sourcePath + "person");
+}
+
+private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) {
+
+Dataset<Tuple2<String, Relation>> relationDataset = spark
+.read()
+.schema(Encoders.bean(Relation.class).schema())
+.json(sourcePath + "relation")
+.as(Encoders.bean(Relation.class))
+.map(
+(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
+r.getSource() + r.getRelClass() + r.getTarget(), r),
+Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
+
+ModelSupport.entityTypes
+.keySet()
+.stream()
+.filter(ModelSupport::isResult)
+.forEach(
+e -> {
+// 1. search for results having orcid_pending and orcid in the set of pids for the authors
+Dataset<Result> resultWithOrcids = spark
+.read()
+.schema(Encoders.bean(Result.class).schema())
+.json(sourcePath + e.name())
+.as(Encoders.bean(Result.class))
+.filter(
+(FilterFunction<Result>) r -> !r.getDataInfo().getDeletedbyinference() &&
+!r.getDataInfo().getInvisible() &&
+Optional
+.ofNullable(r.getAuthor())
+.isPresent())
+.filter(
+(FilterFunction<Result>) r -> r
+.getAuthor()
+.stream()
+.anyMatch(
+a -> Optional
+.ofNullable(
+a
+.getPid())
+.isPresent() &&
+a
+.getPid()
+.stream()
+.anyMatch(
+p -> Arrays
+.asList("orcid", "orcid_pending")
+.contains(p.getQualifier().getClassid().toLowerCase()))));
+// 2. create authorship relations between the result identifier and the person entity with
+// orcid_pending.
+Dataset<Tuple2<String, Relation>> newRelations = resultWithOrcids
+.flatMap(
+(FlatMapFunction<Result, Relation>) r -> getAuthorshipRelations(r),
+Encoders.bean(Relation.class))
+// .groupByKey((MapFunction<Relation, String>) r-> r.getSource()+r.getTarget(), Encoders.STRING() )
+// .mapGroups((MapGroupsFunction<String, Relation, Relation>) (k,it) -> it.next(), Encoders.bean(Relation.class) )
+.map(
+(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
+r.getSource() + r.getRelClass() + r.getTarget(), r),
+Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
+newRelations
+.joinWith(relationDataset, newRelations.col("_1").equalTo(relationDataset.col("_1")), "left")
+.map((MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>, Relation>) t2 -> {
+if (t2._2() == null)
+return t2._1()._2();
+return null;
+}, Encoders.bean(Relation.class))
+.filter((FilterFunction<Relation>) r -> r != null)
+.write()
+.mode(SaveMode.Append)
+.option("compression", "gzip")
+.json(workingPath);
+
+// 2.1 store in a separate location the relation between the person and the pids for the result?
+
+// 3. create co_authorship relations between the pairs of authors with orcid/orcid_pending pids
|
||||||
|
newRelations = resultWithOrcids
|
||||||
|
.map((MapFunction<Result, Coauthors>) r -> getAuthorsPidList(r), Encoders.bean(Coauthors.class))
|
||||||
|
.flatMap(
|
||||||
|
(FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
|
||||||
|
Encoders.bean(Relation.class))
|
||||||
|
.groupByKey(
|
||||||
|
(MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
|
||||||
|
.mapGroups(
|
||||||
|
(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(),
|
||||||
|
Encoders.bean(Relation.class))
|
||||||
|
.map(
|
||||||
|
(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
|
||||||
|
r.getSource() + r.getRelClass() + r.getTarget(), r),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
|
||||||
|
newRelations
|
||||||
|
.joinWith(relationDataset, newRelations.col("_1").equalTo(relationDataset.col("_1")), "left")
|
||||||
|
.map((MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>, Relation>) t2 -> {
|
||||||
|
if (t2._2() == null)
|
||||||
|
return t2._1()._2();
|
||||||
|
return null;
|
||||||
|
}, Encoders.bean(Relation.class))
|
||||||
|
.filter((FilterFunction<Relation>) r -> r != null)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Append)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingPath);
|
||||||
|
|
||||||
|
});
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Relation.class).schema())
|
||||||
|
.json(workingPath)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Append)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(sourcePath + "relation");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Coauthors getAuthorsPidList(Result r) {
|
||||||
|
Coauthors coauth = new Coauthors();
|
||||||
|
coauth
|
||||||
|
.setCoauthors(
|
||||||
|
r
|
||||||
|
.getAuthor()
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
a -> a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
p -> Arrays.asList("orcid", "orcid_pending").contains(p.getQualifier().getClassid())))
|
||||||
|
.map(a -> {
|
||||||
|
Optional<StructuredProperty> tmp = a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||||
|
.findFirst();
|
||||||
|
if (tmp.isPresent())
|
||||||
|
return tmp.get().getValue();
|
||||||
|
tmp = a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending"))
|
||||||
|
.findFirst();
|
||||||
|
if (tmp.isPresent())
|
||||||
|
return tmp.get().getValue();
|
||||||
|
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
return coauth;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Iterator<Relation> getAuthorshipRelations(Result r) {
|
||||||
|
List<Relation> relationList = new ArrayList<>();
|
||||||
|
for (Author a : r.getAuthor())
|
||||||
|
|
||||||
|
relationList.addAll(a.getPid().stream().map(p -> {
|
||||||
|
|
||||||
|
if (p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending"))
|
||||||
|
return getRelation(p.getValue(), r.getId());
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
return relationList.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Relation getRelation(String orcid, String resultId) {
|
||||||
|
|
||||||
|
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
|
||||||
|
|
||||||
|
Relation relation = OafMapperUtils
|
||||||
|
.getRelation(
|
||||||
|
source, resultId, ModelConstants.RESULT_PERSON_RELTYPE,
|
||||||
|
ModelConstants.RESULT_PERSON_SUBRELTYPE,
|
||||||
|
ModelConstants.RESULT_PERSON_HASAUTHORED,
|
||||||
|
null, // collectedfrom = null
|
||||||
|
DATAINFO,
|
||||||
|
null);
|
||||||
|
|
||||||
|
return relation;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@@ -7,4 +7,5 @@ community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunit
 result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app
 community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app
 community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
 country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
+person_propagation classpath eu/dnetlib/dhp/wf/subworkflows/person/oozie_app
@@ -122,6 +122,7 @@
 <case to="community_project">${wf:conf('resumeFrom') eq 'CommunityProject'}</case>
 <case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
 <case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
+<case to="person_propagation">${wf:conf('resumeFrom') eq 'PersonPropagation'}</case>
 <default to="orcid_propagation"/>
 </switch>
 </decision>
@@ -291,10 +292,24 @@
 </property>
 </configuration>
 </sub-workflow>
+<ok to="person_propagation" />
+<error to="Kill" />
+</action>
+<action name="person_propagation">
+<sub-workflow>
+<app-path>${wf:appPath()}/person_propagation
+</app-path>
+<propagate-configuration/>
+<configuration>
+<property>
+<name>sourcePath</name>
+<value>${outputPath}</value>
+</property>
+</configuration>
+</sub-workflow>
 <ok to="country_propagation" />
 <error to="Kill" />
 </action>

 <action name="country_propagation">
 <sub-workflow>
 <app-path>${wf:appPath()}/country_propagation
@@ -319,6 +334,8 @@
 <error to="Kill" />
 </action>



 <end name="End"/>

 </workflow-app>
@@ -34,6 +34,7 @@
 <path start="copy_organization"/>
 <path start="copy_projects"/>
 <path start="copy_datasources"/>
+<path start="copy_persons"/>
 </fork>

 <action name="copy_relation">
@@ -80,6 +81,17 @@
 <error to="Kill"/>
 </action>

+<action name="copy_persons">
+<distcp xmlns="uri:oozie:distcp-action:0.2">
+<job-tracker>${jobTracker}</job-tracker>
+<name-node>${nameNode}</name-node>
+<arg>${nameNode}/${sourcePath}/person</arg>
+<arg>${nameNode}/${outputPath}/person</arg>
+</distcp>
+<ok to="copy_wait"/>
+<error to="Kill"/>
+</action>
+
 <join name="copy_wait" to="fork_prepare_assoc_step1"/>

 <fork name="fork_prepare_assoc_step1">
@@ -0,0 +1,21 @@
[
  {
    "paramName":"s",
    "paramLongName":"sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path used to store temporary output files",
    "paramRequired": true
  },

  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  }
]
@@ -0,0 +1 @@
sourcePath=/tmp/miriam/13_graph_copy
@@ -0,0 +1,58 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>15G</value>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>5G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>4</value>
    </property>
    <property>
        <name>spark2MaxExecutors</name>
        <value>50</value>
    </property>
</configuration>
@@ -0,0 +1,68 @@
<workflow-app name="person_propagation" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>

    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${workingDir}"/>
            <mkdir path="${workingDir}"/>
        </fs>
        <ok to="extract_person_relation_from_graph"/>
        <error to="Kill"/>
    </action>


    <action name="extract_person_relation_from_graph">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>personPropagation</name>
            <class>eu.dnetlib.dhp.person.SparkExtractPersonRelations</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/</arg>
            <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>



    <end name="End"/>

</workflow-app>
@@ -0,0 +1,93 @@

package eu.dnetlib.dhp.person;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

public class PersonPropagationJobTest {

	private static final Logger log = LoggerFactory.getLogger(PersonPropagationJobTest.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(PersonPropagationJobTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(PersonPropagationJobTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(PersonPropagationJobTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	void testPersonPropagation() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/personpropagation/graph")
			.getPath();

		SparkExtractPersonRelations
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--sourcePath", sourcePath,
					"--outputPath", workingDir.toString()
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Relation> tmp = sc
			.textFile(workingDir.toString() + "/relation")
			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));

		// TODO write assertions and find relevant information for the resource files
	}

}
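The test above leaves its assertions as a TODO. A minimal sketch of what such checks could look like is given below; it is an assumption, not part of the commit, and it presumes that the (still to be prepared) resource graph under `/eu/dnetlib/dhp/personpropagation/graph` yields at least one authorship relation. It reuses only names visible in this diff (`tmp`, `Relation`, `ModelConstants.RESULT_PERSON_HASAUTHORED`, the `"0.85"` trust set in `DATAINFO`), and would additionally require importing `eu.dnetlib.dhp.schema.common.ModelConstants` in the test class.

	// Hypothetical assertions for the TODO above; expected values depend on the
	// content of the test resource files and are placeholders, not verified data.
	Assertions.assertFalse(tmp.isEmpty());

	tmp
		.filter(r -> ModelConstants.RESULT_PERSON_HASAUTHORED.equals(r.getRelClass()))
		.collect()
		.forEach(r -> {
			// every authorship relation produced by SparkExtractPersonRelations
			// carries the DATAINFO provenance with trust "0.85"
			Assertions.assertNotNull(r.getDataInfo());
			Assertions.assertEquals("0.85", r.getDataInfo().getTrust());
		});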
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob {
 final Encoder<T> clazzEncoder = Encoders.bean(clazz);

 Dataset<Row> dataset = spark
 .read()
 .schema(clazzEncoder.schema())
 .json(inputPath);

 if (numPartitions > 0) {
 log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@@ -153,34 +153,40 @@ public abstract class AbstractMdRecordToOafMapper {
 final DataInfo entityInfo = prepareDataInfo(doc, this.invisible);
 final long lastUpdateTimestamp = new Date().getTime();

-final List<Instance> instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);
+final Instance instance = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);

-final String type = getResultType(doc, instances);
+if (!Optional
+.ofNullable(instance.getInstancetype())
+.map(Qualifier::getClassid)
+.filter(StringUtils::isNotBlank)
+.isPresent()) {
+return Lists.newArrayList();
+}

-return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp);
+final String type = getResultType(instance);
+
+return createOafs(doc, type, instance, collectedFrom, entityInfo, lastUpdateTimestamp);
 } catch (final DocumentException e) {
 log.error("Error with record:\n" + xml);
 return Lists.newArrayList();
 }
 }

-protected String getResultType(final Document doc, final List<Instance> instances) {
+protected String getResultType(final Instance instance) {
-final String type = doc.valueOf("//dr:CobjCategory/@type");
+if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {

-if (StringUtils.isBlank(type) && this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
-final String instanceType = instances
-.stream()
-.map(i -> i.getInstancetype().getClassid())
-.findFirst()
-.filter(s -> !UNKNOWN.equalsIgnoreCase(s))
-.orElse("0000"); // Unknown
 return Optional
-.ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
+.ofNullable(instance.getInstancetype())
 .map(Qualifier::getClassid)
+.map(
+instanceType -> Optional
+.ofNullable(
+this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
+.map(Qualifier::getClassid)
+.orElse("0000"))
 .orElse("0000");
+} else {
+throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES);
 }

-return type;
 }

 private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
@@ -197,12 +203,12 @@ public abstract class AbstractMdRecordToOafMapper {
 protected List<Oaf> createOafs(
 final Document doc,
 final String type,
-final List<Instance> instances,
+final Instance instance,
 final KeyValue collectedFrom,
 final DataInfo info,
 final long lastUpdateTimestamp) {

-final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
+final OafEntity entity = createEntity(doc, type, instance, collectedFrom, info, lastUpdateTimestamp);

 final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
 originalId.add(entity.getId());
@@ -235,19 +241,19 @@ public abstract class AbstractMdRecordToOafMapper {
 private OafEntity createEntity(final Document doc,
 final String type,
-final List<Instance> instances,
+final Instance instance,
 final KeyValue collectedFrom,
 final DataInfo info,
 final long lastUpdateTimestamp) {
 switch (type.toLowerCase()) {
 case "publication":
 final Publication p = new Publication();
-populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+populateResultFields(p, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 p.setJournal(prepareJournal(doc, info));
 return p;
 case "dataset":
 final Dataset d = new Dataset();
-populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+populateResultFields(d, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 d.setStoragedate(prepareDatasetStorageDate(doc, info));
 d.setDevice(prepareDatasetDevice(doc, info));
 d.setSize(prepareDatasetSize(doc, info));
@@ -258,7 +264,7 @@ public abstract class AbstractMdRecordToOafMapper {
 return d;
 case "software":
 final Software s = new Software();
-populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+populateResultFields(s, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
 s.setLicense(prepareSoftwareLicenses(doc, info));
 s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
@@ -268,7 +274,7 @@ public abstract class AbstractMdRecordToOafMapper {
 case "otherresearchproducts":
 default:
 final OtherResearchProduct o = new OtherResearchProduct();
-populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+populateResultFields(o, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
 o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
 o.setTool(prepareOtherResearchProductTools(doc, info));
@@ -415,7 +421,7 @@ public abstract class AbstractMdRecordToOafMapper {
 private void populateResultFields(
 final Result r,
 final Document doc,
-final List<Instance> instances,
+final Instance instance,
 final KeyValue collectedFrom,
 final DataInfo info,
 final long lastUpdateTimestamp) {
@@ -449,8 +455,8 @@ public abstract class AbstractMdRecordToOafMapper {
 r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
 r.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
 r.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
-r.setInstance(instances);
+r.setInstance(Arrays.asList(instance));
-r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
+r.setBestaccessright(OafMapperUtils.createBestAccessRights(Arrays.asList(instance)));
 r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
 }

@@ -509,7 +515,7 @@ public abstract class AbstractMdRecordToOafMapper {

 protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);

-protected abstract List<Instance> prepareInstances(
+protected abstract Instance prepareInstances(
 Document doc,
 DataInfo info,
 KeyValue collectedfrom,
@@ -657,13 +663,21 @@ public abstract class AbstractMdRecordToOafMapper {
 final Node n = (Node) o;
 final String classId = n.valueOf(xpathClassId).trim();
 if (this.vocs.termExists(schemeId, classId)) {
-res
-.add(
-HashableStructuredProperty
-.newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
+final String value = n.getText();
+if (StringUtils.isNotBlank(value)) {
+res
+.add(
+HashableStructuredProperty
+.newInstance(value, this.vocs.getTermAsQualifier(schemeId, classId), info));
+}
 }
 }
-return Lists.newArrayList(res);
+return res
+.stream()
+.filter(Objects::nonNull)
+.filter(p -> StringUtils.isNotBlank(p.getValue()))
+.filter(p -> StringUtils.isNotBlank(p.getValue().trim()))
+.collect(Collectors.toList());
 }

 protected List<StructuredProperty> prepareListStructProps(
@@ -133,7 +133,7 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication {
 inputRdd
 .keyBy(oaf -> ModelSupport.idFn().apply(oaf))
 .groupByKey()
-.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
+.map(t -> MergeUtils.mergeGroup(t._2.iterator())),
 // .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
 // .reduceByKey(MergeUtils::merge)
 // .map(Tuple2::_2),
@@ -519,6 +519,28 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
 r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
 break;
+case "resultOrganization_affiliation_isAuthorInstitutionOf":
+if (!"organization".equals(sourceType)) {
+throw new IllegalStateException(
+String
+.format(
+"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
+semantics));
+}
+r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
+r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
+break;
+case "resultOrganization_affiliation_hasAuthorInstitution":
+if (!"organization".equals(targetType)) {
+throw new IllegalStateException(
+String
+.format(
+"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
+semantics));
+}
+r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
+r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
+break;
 default:
 throw new IllegalArgumentException("claim semantics not managed: " + semantics);
 }
@@ -135,7 +135,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 }

 @Override
-protected List<Instance> prepareInstances(
+protected Instance prepareInstances(
 final Document doc,
 final DataInfo info,
 final KeyValue collectedfrom,
@@ -197,7 +197,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 instance.getUrl().addAll(validUrl);
 }

-return Lists.newArrayList(instance);
+return instance;
 }

 /**
@@ -126,7 +126,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 }

 @Override
-protected List<Instance> prepareInstances(
+protected Instance prepareInstances(
 final Document doc,
 final DataInfo info,
 final KeyValue collectedfrom,
@@ -210,7 +210,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 instance.setUrl(new ArrayList<>());
 instance.getUrl().addAll(validUrl);
 }
-return Arrays.asList(instance);
+return instance;
 }

 protected String trimAndDecodeUrl(String url) {
@@ -80,9 +80,6 @@ public class PatchRelationsApplication {
 final Dataset<Relation> rels = readPath(spark, relationPath, Relation.class);
 final Dataset<RelationIdMapping> idMapping = readPath(spark, idMappingPath, RelationIdMapping.class);

-log.info("relations: {}", rels.count());
-log.info("idMapping: {}", idMapping.count());
-
 final Dataset<Relation> bySource = rels
 .joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
 .map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
@@ -22,5 +22,11 @@
 "paramLongName": "targetPath",
 "paramDescription": "the output path of the graph enriched",
 "paramRequired": true
+},
+{
+"paramName": "wp",
+"paramLongName": "workingDir",
+"paramDescription": "the working dir",
+"paramRequired": true
 }
 ]
@@ -51,6 +51,7 @@
 <arg>--orcidPath</arg><arg>${orcidPath}</arg>
 <arg>--targetPath</arg><arg>${targetPath}</arg>
 <arg>--graphPath</arg><arg>${graphPath}</arg>
+<arg>--workingDir</arg><arg>${workingDir}</arg>
 <arg>--master</arg><arg>yarn</arg>
 </spark>
 <ok to="reset_outputpath"/>
@@ -89,6 +90,14 @@
 <arg>${nameNode}/${graphPath}/project</arg>
 <arg>${nameNode}/${targetPath}/project</arg>
 </distcp>
+<ok to="copy_person"/>
+<error to="Kill"/>
+</action>
+<action name="copy_person">
+<distcp xmlns="uri:oozie:distcp-action:0.2">
+<arg>${nameNode}/${graphPath}/person</arg>
+<arg>${nameNode}/${targetPath}/person</arg>
+</distcp>
 <ok to="copy_relation"/>
 <error to="Kill"/>
 </action>
@@ -142,6 +142,7 @@
 <path start="clean_datasource"/>
 <path start="clean_organization"/>
 <path start="clean_project"/>
+<path start="clean_person"/>
 <path start="clean_relation"/>
 </fork>

@@ -161,6 +162,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
@@ -196,6 +198,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=8000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
@@ -231,6 +234,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=5000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
@@ -266,6 +270,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=2000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
@@ -301,6 +306,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=1000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
@@ -336,6 +342,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=1000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
@@ -371,6 +378,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=2000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
@@ -390,6 +398,42 @@
 <error to="Kill"/>
 </action>

+<action name="clean_person">
+<spark xmlns="uri:oozie:spark-action:0.2">
+<master>yarn</master>
+<mode>cluster</mode>
+<name>Clean person</name>
+<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
+<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+<spark-opts>
+--executor-cores=${sparkExecutorCores}
+--executor-memory=${sparkExecutorMemory}
+--driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+--conf spark.extraListeners=${spark2ExtraListeners}
+--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
+--conf spark.sql.shuffle.partitions=2000
+</spark-opts>
+<arg>--inputPath</arg><arg>${graphInputPath}/person</arg>
+<arg>--outputPath</arg><arg>${graphOutputPath}/person</arg>
+<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+<arg>--contextId</arg><arg>${contextId}</arg>
+<arg>--verifyParam</arg><arg>${verifyParam}</arg>
+<arg>--country</arg><arg>${country}</arg>
+<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
+<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
+<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
+<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
+<arg>--deepClean</arg><arg>${shouldClean}</arg>
+</spark>
+<ok to="wait_clean"/>
+<error to="Kill"/>
+</action>
+
 <action name="clean_relation">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
@@ -406,6 +450,7 @@
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.autoBroadcastJoinThreshold=-1
 --conf spark.sql.shuffle.partitions=20000
 </spark-opts>
 <arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
@@ -102,6 +102,7 @@
 <path start="import_datasource"/>
 <path start="import_organization"/>
 <path start="import_project"/>
+<path start="import_person"/>
 <path start="import_relation"/>
 </fork>

@@ -308,6 +309,35 @@
 <error to="Kill"/>
 </action>

+<action name="import_person">
+<spark xmlns="uri:oozie:spark-action:0.2">
+<master>yarn</master>
+<mode>cluster</mode>
+<name>Import table person</name>
+<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+<spark-opts>
+--executor-memory=${sparkExecutorMemory}
+--executor-cores=${sparkExecutorCores}
+--driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+--conf spark.extraListeners=${spark2ExtraListeners}
+--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+--conf spark.sql.shuffle.partitions=1000
+</spark-opts>
+<arg>--inputPath</arg><arg>${inputPath}/person</arg>
+<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
+<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+<arg>--numPartitions</arg><arg>1000</arg>
+</spark>
+<ok to="join_import"/>
+<error to="Kill"/>
+</action>
+
 <action name="import_relation">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
@@ -68,6 +68,7 @@
 <path start="merge_datasource"/>
 <path start="merge_organization"/>
 <path start="merge_project"/>
+<path start="merge_person"/>
 <path start="merge_relation"/>
 </fork>

@@ -260,6 +261,33 @@
 <error to="Kill"/>
 </action>

+<action name="merge_person">
+<spark xmlns="uri:oozie:spark-action:0.2">
+<master>yarn</master>
+<mode>cluster</mode>
+<name>Merge person</name>
+<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
+<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+<spark-opts>
+--executor-cores=${sparkExecutorCores}
+--executor-memory=${sparkExecutorMemory}
+--driver-memory=${sparkDriverMemory}
+--conf spark.extraListeners=${spark2ExtraListeners}
+--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.shuffle.partitions=7680
+</spark-opts>
+<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/person</arg>
+<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/person</arg>
+<arg>--outputPath</arg><arg>${graphOutputPath}/person</arg>
+<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+<arg>--priority</arg><arg>${priority}</arg>
+</spark>
+<ok to="wait_merge"/>
+<error to="Kill"/>
+</action>
+
 <action name="merge_relation">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
@@ -649,6 +649,7 @@
 <path start="merge_claims_datasource"/>
 <path start="merge_claims_organization"/>
 <path start="merge_claims_project"/>
+<path start="merge_claims_person"/>
 <path start="merge_claims_relation"/>
 </fork>

@@ -860,6 +861,32 @@
 <error to="Kill"/>
 </action>

+<action name="merge_claims_person">
+<spark xmlns="uri:oozie:spark-action:0.2">
+<master>yarn</master>
+<mode>cluster</mode>
+<name>MergeClaims_person</name>
+<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
+<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+<spark-opts>
+--executor-memory ${sparkExecutorMemory}
+--executor-cores ${sparkExecutorCores}
+--driver-memory=${sparkDriverMemory}
+--conf spark.extraListeners=${spark2ExtraListeners}
+--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+--conf spark.sql.shuffle.partitions=200
+</spark-opts>
+<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
+<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
+<arg>--outputRawGaphPath</arg><arg>${graphOutputPath}</arg>
+<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+</spark>
+<ok to="wait_merge"/>
+<error to="Kill"/>
+</action>
+
 <join name="wait_merge" to="decisionPatchRelations"/>

 <decision name="decisionPatchRelations">
@@ -47,13 +47,15 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
 log.info(s"orcidPath is '$orcidPath'")
 val targetPath = parser.get("targetPath")
 log.info(s"targetPath is '$targetPath'")
+val workingDir = parser.get("workingDir")
+log.info(s"workingDir is '$workingDir'")

-createTemporaryData(graphPath, orcidPath, targetPath)
+createTemporaryData(graphPath, orcidPath, workingDir)
-analisys(targetPath)
+analisys(workingDir)
-generateGraph(graphPath, targetPath)
+generateGraph(graphPath, workingDir, targetPath)
 }

-private def generateGraph(graphPath: String, targetPath: String): Unit = {
+private def generateGraph(graphPath: String, workingDir: String, targetPath: String): Unit = {

 ModelSupport.entityTypes.asScala
 .filter(e => ModelSupport.isResult(e._1))
@@ -63,7 +65,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]

 val matched = spark.read
 .schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema)
-.parquet(s"${targetPath}/${resultType}_matched")
+.parquet(s"${workingDir}/${resultType}_matched")
 .selectExpr("id", "enriched_author")

 spark.read
@@ -133,7 +133,7 @@ object SparkCreateInputGraph {
 val ds: Dataset[T] = spark.read.load(sourcePath).as[T]

 ds.groupByKey(_.getId)
-.mapGroups { (id, it) => MergeUtils.mergeGroup(id, it.asJava).asInstanceOf[T] }
+.mapGroups { (id, it) => MergeUtils.mergeGroup(it.asJava).asInstanceOf[T] }
 // .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] }
 // .map(_)
 .write
@@ -30,6 +30,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

@@ -365,6 +367,40 @@ class MigrateDbEntitiesApplicationTest {
     assertValidId(r2.getCollectedfrom().get(0).getKey());
 }

+@Test
+void testProcessClaims_affiliation() throws Exception {
+    final List<TypedField> fields = prepareMocks("claimsrel_resultset_affiliation.json");
+
+    final List<Oaf> list = app.processClaims(rs);
+
+    assertEquals(2, list.size());
+    verifyMocks(fields);
+
+    assertTrue(list.get(0) instanceof Relation);
+    assertTrue(list.get(1) instanceof Relation);
+
+    final Relation r1 = (Relation) list.get(0);
+    final Relation r2 = (Relation) list.get(1);
+
+    assertValidId(r1.getSource());
+    assertValidId(r1.getTarget());
+    assertValidId(r2.getSource());
+    assertValidId(r2.getTarget());
+    assertNotNull(r1.getDataInfo());
+    assertNotNull(r2.getDataInfo());
+    assertNotNull(r1.getDataInfo().getTrust());
+    assertNotNull(r2.getDataInfo().getTrust());
+    assertEquals(r1.getSource(), r2.getTarget());
+    assertEquals(r2.getSource(), r1.getTarget());
+    assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
+    assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
+    assertTrue(StringUtils.isNotBlank(r1.getRelType()));
+    assertTrue(StringUtils.isNotBlank(r2.getRelType()));
+
+    assertValidId(r1.getCollectedfrom().get(0).getKey());
+    assertValidId(r2.getCollectedfrom().get(0).getKey());
+}
+
 private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
     final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
     final ObjectMapper mapper = new ObjectMapper();
@@ -0,0 +1,27 @@
+[
+    {
+        "field": "source_type",
+        "type": "string",
+        "value": "organization"
+    },
+    {
+        "field": "source_id",
+        "type": "string",
+        "value": "openorgs____::b5ca9d4340e26454e367e2908ef3872f"
+    },
+    {
+        "field": "target_type",
+        "type": "string",
+        "value": "software"
+    },
+    {
+        "field": "target_id",
+        "type": "string",
+        "value": "userclaim___::bde53826d07c8cf47c99222a375cd2e8"
+    },
+    {
+        "field": "semantics",
+        "type": "string",
+        "value": "resultOrganization_affiliation_isAuthorInstitutionOf"
+    }
+]
@@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
     assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
     // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
   }
+
   @Test def testDocumentationNames(): Unit = {
     assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
   }
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

+import java.util.Comparator;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
@@ -167,8 +168,9 @@ public class CreateRelatedEntitiesJob_phase1 {
     result
         .getDescription()
         .stream()
-        .findFirst()
+        .filter(d -> Objects.nonNull(d.getValue()))
         .map(Field::getValue)
+        .max(Comparator.comparingInt(String::length))
         .ifPresent(
             d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)));
 }
@@ -231,6 +233,14 @@ public class CreateRelatedEntitiesJob_phase1 {
             if (!f.isEmpty()) {
                 re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList()));
             }
+            break;
+        case person:
+            final Person person = (Person) entity;
+
+            re.setGivenName(person.getGivenName());
+            re.setFamilyName(person.getFamilyName());
+            re.setAlternativeNames(person.getAlternativeNames());
+
             break;
         }
         return re;
@@ -2,10 +2,12 @@
 package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits.MAX_RELATIONS_BY_RELCLASS;
 import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;

 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;

 import org.apache.commons.io.IOUtils;
@@ -15,11 +17,13 @@ import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.util.LongAccumulator;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.annotation.JsonInclude;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@@ -27,11 +31,13 @@ import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
 import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
+import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
 import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
 import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
 import eu.dnetlib.dhp.schema.solr.SolrRecord;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@@ -124,6 +130,9 @@ public class PayloadConverterJob {
             .map(Oaf::getDataInfo)
             .map(DataInfo::getDeletedbyinference)
             .orElse(false))
+        .map(
+            (MapFunction<JoinedEntity, JoinedEntity>) PayloadConverterJob::pruneRelatedEntities,
+            Encoders.kryo(JoinedEntity.class))
         .map(
             (MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
                 recordFactory.build(je, validateXML),
@@ -139,6 +148,32 @@ public class PayloadConverterJob {
         .json(outputPath);
 }

+/**
+ * This function iterates through the RelatedEntityWrapper(s) associated to the JoinedEntity and rules out
+ * those exceeding the maximum allowed frequency defined in eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits#MAX_RELATIONS_BY_RELCLASS
+ */
+private static JoinedEntity pruneRelatedEntities(JoinedEntity je) {
+    Map<String, Long> freqs = Maps.newHashMap();
+    List<RelatedEntityWrapper> rew = Lists.newArrayList();
+
+    if (je.getLinks() != null) {
+        je.getLinks().forEach(link -> {
+            final String relClass = link.getRelation().getRelClass();
+
+            final Long count = freqs.getOrDefault(relClass, 0L);
+            final Long max = MAX_RELATIONS_BY_RELCLASS.getOrDefault(relClass, Long.MAX_VALUE);
+
+            if (count <= max) {
+                rew.add(link);
+                freqs.put(relClass, freqs.getOrDefault(relClass, 0L) + 1);
+            }
+        });
+        je.setLinks(rew);
+    }
+
+    return je;
+}
+
 private static void removeOutputDir(final SparkSession spark, final String path) {
     HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 }
@@ -23,6 +23,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
 import eu.dnetlib.dhp.schema.solr.*;
 import eu.dnetlib.dhp.schema.solr.AccessRight;
 import eu.dnetlib.dhp.schema.solr.Author;
@@ -37,6 +38,8 @@ import eu.dnetlib.dhp.schema.solr.Measure;
 import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
 import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
 import eu.dnetlib.dhp.schema.solr.Organization;
+import eu.dnetlib.dhp.schema.solr.Person;
+import eu.dnetlib.dhp.schema.solr.PersonTopic;
 import eu.dnetlib.dhp.schema.solr.Pid;
 import eu.dnetlib.dhp.schema.solr.Project;
 import eu.dnetlib.dhp.schema.solr.Result;
@@ -89,6 +92,8 @@ public class ProvisionModelSupport {
         r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
     } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
         r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
+    } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Person) {
+        r.setPerson(mapPerson((eu.dnetlib.dhp.schema.oaf.Person) e));
     }
     r
         .setLinks(
@@ -108,7 +113,7 @@ public class ProvisionModelSupport {
     RelatedRecord rr = new RelatedRecord();

     final RelatedEntity re = rew.getTarget();
-    final RecordType relatedRecordType = RecordType.valueOf(re.getType());
+    final RecordType relatedRecordType = RecordType.fromString(re.getType());
     final Relation relation = rew.getRelation();
     final String relationProvenance = Optional
         .ofNullable(relation.getDataInfo())
@@ -150,6 +155,17 @@ public class ProvisionModelSupport {
     rr.setPublisher(re.getPublisher());
     rr.setResulttype(mapQualifier(re.getResulttype()));
     rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));
+    rr.setDescription(StringUtils.left(re.getDescription(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH));
+    rr
+        .setAuthor(
+            Optional
+                .ofNullable(re.getAuthor())
+                .map(
+                    aa -> aa
+                        .stream()
+                        .limit(ModelHardLimits.MAX_RELATED_AUTHORS)
+                        .collect(Collectors.toList()))
+                .orElse(null));

     if (relation.getValidated() == null) {
         relation.setValidated(false);
@@ -185,6 +201,18 @@ public class ProvisionModelSupport {
     return ps;
 }

+private static Person mapPerson(eu.dnetlib.dhp.schema.oaf.Person p) {
+    Person ps = new Person();
+    ps.setFamilyName(p.getFamilyName());
+    ps.setGivenName(p.getGivenName());
+    ps.setAlternativeNames(p.getAlternativeNames());
+    ps.setBiography(p.getBiography());
+    ps.setConsent(p.getConsent());
+    // ps.setSubject(...));
+
+    return ps;
+}
+
 private static Funding mapFunding(List<String> fundingtree, VocabularyGroup vocs) {
     SAXReader reader = new SAXReader();
     return Optional
@@ -378,6 +406,7 @@ public class ProvisionModelSupport {
     rs.setPubliclyFunded(r.getPubliclyFunded());
     rs.setTransformativeAgreement(r.getTransformativeAgreement());
     rs.setExternalReference(mapExternalReference(r.getExternalReference()));
+    rs.setBestinstancetype(mapQualifier(r.getBestInstancetype()));
     rs.setInstance(mapInstances(r.getInstance()));

     if (r instanceof Publication) {
@@ -667,14 +696,23 @@ public class ProvisionModelSupport {
 }

 private static List<Author> asAuthor(List<eu.dnetlib.dhp.schema.oaf.Author> authorList) {
+    return asAuthor(authorList, ModelHardLimits.MAX_AUTHORS);
+}
+
+private static List<Author> asAuthor(List<eu.dnetlib.dhp.schema.oaf.Author> authorList, int maxAuthors) {
     return Optional
         .ofNullable(authorList)
         .map(
             authors -> authors
                 .stream()
+                .limit(maxAuthors)
                 .map(
                     a -> Author
-                        .newInstance(a.getFullname(), a.getName(), a.getSurname(), a.getRank(), asPid(a.getPid())))
+                        .newInstance(
+                            StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH),
+                            a.getName(),
+                            a.getSurname(),
+                            a.getRank(), asPid(a.getPid())))
                 .collect(Collectors.toList()))
         .orElse(null);
 }
@@ -51,6 +51,11 @@ public class RelatedEntity implements Serializable {
     private Qualifier contracttype;
     private List<String> fundingtree;

+    // person
+    private String givenName;
+    private String familyName;
+    private List<String> alternativeNames;
+
     public String getId() {
         return id;
     }
@@ -251,6 +256,30 @@ public class RelatedEntity implements Serializable {
         this.fundingtree = fundingtree;
     }

+    public String getGivenName() {
+        return givenName;
+    }
+
+    public void setGivenName(String givenName) {
+        this.givenName = givenName;
+    }
+
+    public String getFamilyName() {
+        return familyName;
+    }
+
+    public void setFamilyName(String familyName) {
+        this.familyName = familyName;
+    }
+
+    public List<String> getAlternativeNames() {
+        return alternativeNames;
+    }
+
+    public void setAlternativeNames(List<String> alternativeNames) {
+        this.alternativeNames = alternativeNames;
+    }
+
     @Override
     public boolean equals(Object o) {
         if (this == o)
@@ -280,7 +309,10 @@ public class RelatedEntity implements Serializable {
             && Objects.equal(code, that.code)
             && Objects.equal(acronym, that.acronym)
             && Objects.equal(contracttype, that.contracttype)
-            && Objects.equal(fundingtree, that.fundingtree);
+            && Objects.equal(fundingtree, that.fundingtree)
+            && Objects.equal(givenName, that.givenName)
+            && Objects.equal(familyName, that.familyName)
+            && Objects.equal(alternativeNames, that.alternativeNames);
     }

     @Override
@@ -309,6 +341,9 @@ public class RelatedEntity implements Serializable {
             code,
             acronym,
             contracttype,
-            fundingtree);
+            fundingtree,
+            familyName,
+            givenName,
+            alternativeNames);
     }
 }
@@ -1035,6 +1035,48 @@ public class XmlRecordFactory implements Serializable {
                 .collect(Collectors.toList()));
         }

+        break;
+    case person:
+        final Person person = (Person) entity;
+
+        if (person.getGivenName() != null) {
+            metadata.add(XmlSerializationUtils.asXmlElement("givenname", person.getGivenName()));
+        }
+        if (person.getFamilyName() != null) {
+            metadata.add(XmlSerializationUtils.asXmlElement("familyname", person.getFamilyName()));
+        }
+        if (person.getAlternativeNames() != null) {
+            metadata
+                .addAll(
+                    person
+                        .getAlternativeNames()
+                        .stream()
+                        .map(altName -> XmlSerializationUtils.asXmlElement("alternativename", altName))
+                        .collect(Collectors.toList()));
+        }
+        if (person.getBiography() != null) {
+            metadata.add(XmlSerializationUtils.asXmlElement("biography", person.getBiography()));
+        }
+        if (person.getSubject() != null) {
+            metadata
+                .addAll(
+                    person
+                        .getSubject()
+                        .stream()
+                        .map(pt -> {
+                            List<Tuple2<String, String>> attrs = Lists.newArrayList();
+                            attrs.add(new Tuple2<>("schema", pt.getSchema()));
+                            attrs.add(new Tuple2<>("value", pt.getValue()));
+                            attrs.add(new Tuple2<>("fromYear", String.valueOf(pt.getFromYear())));
+                            attrs.add(new Tuple2<>("toYear", String.valueOf(pt.getToYear())));
+                            return XmlSerializationUtils.asXmlElement("subject", attrs);
+                        })
+                        .collect(Collectors.toList()));
+        }
+        if (person.getConsent() != null) {
+            metadata.add(XmlSerializationUtils.asXmlElement("consent", String.valueOf(person.getConsent())));
+        }
+
         break;
     default:
         throw new IllegalArgumentException("invalid entity type: " + type);
@@ -1240,6 +1282,25 @@ public class XmlRecordFactory implements Serializable {
                 .collect(Collectors.toList()));
         }
         break;
+
+    case person:
+
+        if (isNotBlank(re.getGivenName())) {
+            metadata.add(XmlSerializationUtils.asXmlElement("givenname", re.getGivenName()));
+        }
+        if (isNotBlank(re.getFamilyName())) {
+            metadata.add(XmlSerializationUtils.asXmlElement("familyname", re.getFamilyName()));
+        }
+        if (re.getAlternativeNames() != null && !re.getAlternativeNames().isEmpty()) {
+            metadata
+                .addAll(
+                    re
+                        .getAlternativeNames()
+                        .stream()
+                        .map(name -> XmlSerializationUtils.asXmlElement("alternativename", name))
+                        .collect(Collectors.toList()));
+        }
+        break;
     default:
         throw new IllegalArgumentException("invalid target type: " + targetType);
     }
@@ -180,6 +180,7 @@
     <path start="join_relation_datasource"/>
     <path start="join_relation_organization"/>
     <path start="join_relation_project"/>
+    <path start="join_relation_person"/>
 </fork>

 <action name="join_relation_publication">
@@ -378,6 +379,34 @@
     <error to="Kill"/>
 </action>

+<action name="join_relation_person">
+    <spark xmlns="uri:oozie:spark-action:0.2">
+        <master>yarn</master>
+        <mode>cluster</mode>
+        <name>Join[relation.target = person.id]</name>
+        <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+        <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+        <spark-opts>
+            --executor-cores=${sparkExecutorCoresForJoining}
+            --executor-memory=${sparkExecutorMemoryForJoining}
+            --driver-memory=${sparkDriverMemoryForJoining}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+            --conf spark.extraListeners=${spark2ExtraListeners}
+            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            --conf spark.sql.shuffle.partitions=5000
+            --conf spark.network.timeout=${sparkNetworkTimeout}
+        </spark-opts>
+        <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+        <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
+        <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+        <arg>--outputPath</arg><arg>${workingDir}/join_partial/person</arg>
+    </spark>
+    <ok to="wait_joins"/>
+    <error to="Kill"/>
+</action>
+
 <join name="wait_joins" to="fork_join_all_entities"/>

 <fork name="fork_join_all_entities">
@@ -388,6 +417,7 @@
     <path start="join_datasource_relations"/>
     <path start="join_organization_relations"/>
     <path start="join_project_relations"/>
+    <path start="join_person_relations"/>
 </fork>

 <action name="join_publication_relations">
@@ -593,6 +623,35 @@
     <error to="Kill"/>
 </action>

+<action name="join_person_relations">
+    <spark xmlns="uri:oozie:spark-action:0.2">
+        <master>yarn</master>
+        <mode>cluster</mode>
+        <name>Join[person.id = relatedEntity.source]</name>
+        <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+        <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+        <spark-opts>
+            --executor-cores=${sparkExecutorCoresForJoining}
+            --executor-memory=${sparkExecutorMemoryForJoining}
+            --driver-memory=${sparkDriverMemoryForJoining}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+            --conf spark.extraListeners=${spark2ExtraListeners}
+            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            --conf spark.sql.shuffle.partitions=5000
+            --conf spark.network.timeout=${sparkNetworkTimeout}
+        </spark-opts>
+        <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
+        <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+        <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+        <arg>--outputPath</arg><arg>${workingDir}/join_entities/person</arg>
+        <arg>--numPartitions</arg><arg>10000</arg>
+    </spark>
+    <ok to="wait_join_phase2"/>
+    <error to="Kill"/>
+</action>
+
 <join name="wait_join_phase2" to="create_payloads"/>

 <action name="create_payloads">
@@ -1,63 +0,0 @@
-#/usr/bin/bash
-
-# Read log files from ranking scripts and create a two-line file
-# with score limits for the various measures. To be used by Kleanthis
-
-attrank_file=$(ls *attrank*.log);
-pr_file=$(ls *pagerank*.log)
-ram_file=$(ls *ram*.log);
-cc_file=$(ls *cc*.log);
-impulse_file=$(ls *impulse*.log);
-
-echo
-echo "-----------------------------"
-echo "Attrank file:${attrank_file}";
-echo "PageRank file:${pr_file}";
-echo "RAM file:${ram_file}";
-echo "CC file:${cc_file}";
-echo "Impulse file:${impulse_file}";
-echo "-----------------------------"
-echo
-echo
-
-# output file will be called score_limits.csv
-echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv
-# ---------------------------------------------------- #
-# Get respective score limits (we don't need RAM)
-inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2);
-inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2);
-inf_1=$(grep "^1%" ${pr_file} | cut -f 2);
-inf_10=$(grep "^10%" ${pr_file} | cut -f 2);
-echo "Influnence limits:"
-echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}";
-# ---------------------------------------------------- #
-pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2);
-pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2);
-pop_1=$(grep "^1%" ${attrank_file} | cut -f 2);
-pop_10=$(grep "^10%" ${attrank_file} | cut -f 2);
-echo "Popularity limits:";
-echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}";
-# ---------------------------------------------------- #
-imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2);
-imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2);
-imp_1=$(grep "^1%" ${impulse_file} | cut -f 2);
-imp_10=$(grep "^10%" ${impulse_file} | cut -f 2);
-echo "Popularity limits:";
-echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}";
-# ---------------------------------------------------- #
-cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2);
-cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2);
-cc_1=$(grep "^1%" ${cc_file} | cut -f 2);
-cc_10=$(grep "^10%" ${cc_file} | cut -f 2);
-echo "Popularity limits:";
-echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}";
-# ---------------------------------------------------- #
-
-echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv
-
-echo
-echo "score_limits.csv contents:"
-cat score_limits.csv
-
-echo;
-echo;
@@ -1,60 +0,0 @@
-import json
-import sys
-from pyspark.sql import SparkSession
-from pyspark import SparkConf, SparkContext
-
-if len(sys.argv) != 3:
-    print("Usage: map_openaire_ids_to_dois.py <hdfs_src_dir> <hdfs_output_dir>")
-    sys.exit(-1)
-
-conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs')
-sc = SparkContext(conf = conf)
-spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate()
-sc.setLogLevel('OFF')
-
-src_dir = sys.argv[1]
-output = sys.argv[2]
-
-# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/"
-# output = '/tmp/openaireid_to_dois/'
-
-def transform(doc):
-
-    # get publication year from 'doc.dateofacceptance.value'
-    dateofacceptance = doc.get('dateofacceptance', {}).get('value')
-
-    year = 0
-
-    if (dateofacceptance is not None):
-        year = dateofacceptance.split('-')[0]
-
-    # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi'
-    dois = [ pid['value'] for pid in doc.get('pid', []) if (pid.get('qualifier', {}).get('classid') == 'doi' and pid['value'] is not None)]
-
-    num_dois = len(dois)
-
-    # exlcude openaire ids that do not correspond to DOIs
-    if (num_dois == 0):
-        return None
-
-    fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ]
-
-    return '\t'.join([ v.encode('utf-8') for v in fields ])
-
-docs = None
-
-for result_type in ["publication", "dataset", "software", "otherresearchproduct"]:
-
-    tmp = sc.textFile(src_dir + result_type).map(json.loads)
-
-    if (docs is None):
-        docs = tmp
-    else:
-        # append all result types in one RDD
-        docs = docs.union(tmp)
-
-docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False)
-
-docs = docs.map(transform).filter(lambda d: d is not None)
-
-docs.saveAsTextFile(output)
@@ -1,168 +0,0 @@
-#!/usr/bin/python
-# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
-# and uses this mapping to create doi-based score files in the format required by BiP! DB.
-# This is done by reading each openaire-id based ranking file and joining the openaire based
-# score and classes to all the corresponding dois.
-#################################################################################################
-# Imports
-import sys
-
-# Sparksession lib to communicate with cluster via session object
-from pyspark.sql import SparkSession
-
-# Import sql types to define schemas
-from pyspark.sql.types import *
-
-# Import sql functions with shorthand alias
-import pyspark.sql.functions as F
-
-from pyspark.sql.functions import max
-# from pyspark.sql.functions import udf
-#################################################################################################
-#################################################################################################
-# Clean up directory name - no longer needed in final workflow version
-'''
-def clean_directory_name(dir_name):
-    # We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_*
-    # and we need to keep the parts in *
-
-
-    dir_name_parts = dir_name.split('_')
-    dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
-
-    dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
-    clean_name = dir_name + ".txt.gz"
-
-    # clean_name = '_'.join(dir_name_parts)
-
-    # if '_ids' not in clean_name:
-    #     clean_name = clean_name.replace('id_', 'ids_')
-
-    # clean_name = clean_name.replace('.txt', '')
-    # clean_name = clean_name.replace('.gz', '')
-
-    # if 'openaire_ids_' in clean_name:
-    #     clean_name = clean_name.replace('openaire_ids_', '')
-    #     clean_name = clean_name + '.txt.gz'
-    # else:
-    #     clean_name = clean_name + '.txt.gz'
-
-    return clean_name
-'''
-#################################################################################################
-if len(sys.argv) < 3:
-    print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
-    sys.exit(-1)
-
-# Read arguments
-synonyms_folder = sys.argv[1]
-num_partitions = int(sys.argv[2])
-input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
-# input_file_list = [clean_directory_name(item) for item in input_file_list]
-
-# Prepare output specific variables
-output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
-output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]
-
-# --- INFO MESSAGES --- #
-print ("\n\n----------------------------")
-print ("Mpping openaire ids to DOIs")
-print ("Reading input from: " + synonyms_folder)
-print ("Num partitions: " + str(num_partitions))
-print ("Input files:" + " -- ".join(input_file_list))
-print ("Output files: " + " -- ".join(output_file_list))
-print ("----------------------------\n\n")
-#######################################################################################
-# We weill define the following schemas:
-# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non printable character)
-# --> a schema for floating point ranking scores [string - float - string] (the latter string is the class)
-# --> a schema for integer ranking scores [string - int - string] (the latter string is the class)
-
-float_schema = StructType([
-    StructField('id', StringType(), False),
-    StructField('score', FloatType(), False),
-    StructField('class', StringType(), False)
-])
-
-int_schema = StructType([
-    StructField('id', StringType(), False),
-    StructField('score', IntegerType(), False),
-    StructField('class', StringType(), False)
-])
-
-# This schema concerns the output of the file
-# containing the number of references of each doi
-synonyms_schema = StructType([
-    StructField('id', StringType(), False),
-    StructField('num_synonyms', IntegerType(), False),
-    StructField('doi_list', StringType(), False),
-])
-#######################################################################################
-# Start spark session
-spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate()
-# Set Log Level for spark session
-spark.sparkContext.setLogLevel('WARN')
-#######################################################################################
-# MAIN Program
-
-# Read and repartition the synonym folder - also cache it since we will need to perform multiple joins
-synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder)
-synonym_df = synonym_df.select('id', F.split(F.col('doi_list'), chr(0x02)).alias('doi_list'))
-synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache()
-
-# TESTING
-# print ("Synonyms: " + str(synonym_df.count()))
-# print ("DF looks like this:" )
-# synonym_df.show(1000, False)
-
-print ("\n\n-----------------------------")
-# Now we need to join the score files on the openaire-id with the synonyms and then keep
-# only doi - score - class and write this to the output
-for offset, input_file in enumerate(input_file_list):
-
-    print ("Mapping scores from " + input_file)
-
-    # Select correct schema
-    schema = int_schema
-    if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower():
-        schema = float_schema
-
-    # Load file to dataframe
-    ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id')
-
-    # Get max score
-    max_score = ranking_df.select(max('score').alias('max')).collect()[0]['max']
-    print ("Max Score for " + str(input_file) + " is " + str(max_score))
-
-    # TESTING
-    # print ("Loaded df sample:")
-    # ranking_df.show(1000, False)
-
-    # Join scores to synonyms and keep required fields
-    doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi').cache()
-    # Write output
-    output_file = output_file_list[offset]
-    print ("Writing to: " + output_file)
-    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
-
-    # Creata another file for the bip update process
-    ranking_df = ranking_df.select('id', 'score', F.lit(F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup'))
-    doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi').cache()
-    output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz")
-    print ("Writing bip update to: " + output_file)
-    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
-
-
-    # Free memory?
-    ranking_df.unpersist(True)
-
-print ("-----------------------------")
-print ("\n\nFinished!\n\n")
-
-
-
-
-
-
-
-
@@ -17,10 +17,6 @@
     <name>openaireGraphInputPath</name>
     <value>${nameNode}/${workingDir}/openaire_id_graph</value>
 </property>
-<property>
-    <name>synonymFolder</name>
-    <value>${nameNode}/${workingDir}/openaireid_to_dois/</value>
-</property>
 <property>
     <name>checkpointDir</name>
     <value>${nameNode}/${workingDir}/check/</value>
@@ -32,29 +28,34 @@
 </configuration>
 </global>

-<!-- start using a decision node, so as to determine from which point onwards a job will continue -->
+<!-- Start using a decision node, to determine from which point onwards a job will continue -->
 <start to="entry-point-decision" />

 <decision name="entry-point-decision">
     <switch>
-        <!-- The default will be set as the normal start, a.k.a. get-doi-synonyms -->
-        <!-- If any different condition is set, go to the corresponding start -->
+        <!-- Start from creating the citation network (i.e., normal execution should start from here) -->
+        <case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
+
+        <!-- Different citation-based impact indicators are computed -->
         <case to="spark-cc">${wf:conf('resume') eq "cc"}</case>
         <case to="spark-ram">${wf:conf('resume') eq "ram"}</case>
         <case to="spark-impulse">${wf:conf('resume') eq "impulse"}</case>
         <case to="spark-pagerank">${wf:conf('resume') eq "pagerank"}</case>
         <case to="spark-attrank">${wf:conf('resume') eq "attrank"}</case>
-        <!-- <case to="iterative-rankings">${wf:conf('resume') eq "rankings-iterative"}</case> -->
-        <case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
-        <case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
-        <case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
-        <case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>

-        <!-- Aggregation of impact scores on the project level -->
+        <!-- Format the results appropriately before transforming them to action sets -->
+        <case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
+
+        <!-- Aggregation of impact scores on the project level -->
         <case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>

+        <!-- Create action sets -->
         <case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>

+        <!-- The default will be set as the normal start, a.k.a. create-openaire-ranking-graph -->
         <default to="create-openaire-ranking-graph" />

     </switch>
 </decision>

@@ -295,18 +296,11 @@
     <capture-output/>
 </shell>

-<ok to="format-result-files" />
+<ok to="format-json-files" />
 <error to="filename-getting-error" />

 </action>

-<!-- Now we will run in parallel the formatting of ranking files for BiP! DB and openaire (json files) -->
-<fork name="format-result-files">
-    <path start="format-bip-files"/>
-    <path start="format-json-files"/>
-</fork>
-
-
 <!-- Format json files -->
 <!-- Two parts: a) format files b) make the file endings .json.gz -->
 <action name="format-json-files">
@@ -345,139 +339,8 @@
     <file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
 </spark>

-<ok to="join-file-formatting" />
-<error to="json-formatting-fail" />
-</action>
-
-<!-- This is the second line of parallel workflow execution where we create the BiP! DB files -->
-<action name="format-bip-files">
-    <!-- This is required as a tag for spark jobs, regardless of programming language -->
-    <spark xmlns="uri:oozie:spark-action:0.2">
-
-        <!-- using configs from an example on openaire -->
-        <master>yarn-cluster</master>
-        <mode>cluster</mode>
-
-        <!-- This is the name of our job -->
-        <name>Format Ranking Results BiP! DB</name>
-        <!-- Script name goes here -->
-        <jar>format_ranking_results.py</jar>
-        <!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-
-        <spark-opts>
-            --executor-memory=${sparkNormalExecutorMemory}
-            --executor-cores=${sparkExecutorCores}
-            --driver-memory=${sparkNormalDriverMemory}
-            --conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
-            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-        </spark-opts>
-
-        <!-- Script arguments here -->
-        <arg>zenodo</arg>
-        <!-- Input files must be identified dynamically -->
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
-        <!-- Num partitions -->
-        <arg>${sparkShufflePartitions}</arg>
-        <!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
-        <arg>openaire</arg>
-        <!-- This needs to point to the file on the hdfs i think -->
-        <file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
-    </spark>
-
-    <ok to="join-file-formatting" />
-    <error to="bip-formatting-fail" />
-</action>
-
-<!-- Finish formatting jobs -->
-<join name="join-file-formatting" to="map-openaire-to-doi"/>
-
-<!-- maps openaire ids to DOIs -->
-<action name="map-openaire-to-doi">
-    <spark xmlns="uri:oozie:spark-action:0.2">
-
-        <!-- Delete previously created doi synonym folder -->
-        <prepare>
-            <delete path="${synonymFolder}"/>
-        </prepare>
-
-        <master>yarn-cluster</master>
-        <mode>cluster</mode>
-        <name>Openaire-DOI synonym collection</name>
-        <jar>map_openaire_ids_to_dois.py</jar>
-
-        <spark-opts>
-            --executor-memory=${sparkHighExecutorMemory}
-            --executor-cores=${sparkExecutorCores}
-            --driver-memory=${sparkHighDriverMemory}
-            --conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
-            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-        </spark-opts>
-
-        <!-- Script arguments here -->
-        <arg>${openaireDataInput}/</arg>
-        <!-- number of partitions to be used on joins -->
-        <arg>${synonymFolder}</arg>
-
-        <file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
-    </spark>
-
-    <ok to="map-scores-to-dois" />
-    <error to="synonym-collection-fail" />
-
-</action>
-
-<!-- mapping openaire scores to DOIs -->
-<action name="map-scores-to-dois">
-    <!-- This is required as a tag for spark jobs, regardless of programming language -->
-    <spark xmlns="uri:oozie:spark-action:0.2">
-
-        <!-- using configs from an example on openaire -->
-        <master>yarn-cluster</master>
-        <mode>cluster</mode>
-        <name>Mapping Openaire Scores to DOIs</name>
-        <jar>map_scores_to_dois.py</jar>
-
-        <spark-opts>
-            --executor-memory=${sparkHighExecutorMemory}
-            --executor-cores=${sparkExecutorCores}
-            --driver-memory=${sparkHighDriverMemory}
-            --conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
-            --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-        </spark-opts>
-
-        <!-- Script arguments here -->
-        <arg>${synonymFolder}</arg>
-        <!-- Number of partitions -->
-        <arg>${sparkShufflePartitions}</arg>
-        <!-- The remaining input are the ranking files fproduced for bip db-->
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-        <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
-
-        <file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
-    </spark>
-
 <ok to="project-impact-indicators" />
-<error to="map-scores-fail" />
+<error to="json-formatting-fail" />

 </action>

 <action name="project-impact-indicators">
@@ -594,18 +457,6 @@
     <message>Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 </kill>

-<kill name="bip-formatting-fail">
-    <message>Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-</kill>
-
-<kill name="synonym-collection-fail">
-    <message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-</kill>
-
-<kill name="map-scores-fail">
-    <message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-</kill>
-
 <kill name="actionset-delete-fail">
     <message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 </kill>
@@ -3,8 +3,8 @@ package eu.dnetlib.dhp.swh.models;

 import java.io.Serializable;

-import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;

 @JsonIgnoreProperties(ignoreUnknown = true)
 public class LastVisitData implements Serializable {
pom.xml
@@ -937,7 +937,7 @@
 <commons.logging.version>1.1.3</commons.logging.version>
 <commons-validator.version>1.7</commons-validator.version>
 <dateparser.version>1.0.7</dateparser.version>
-<dhp-schemas.version>[8.0.1]</dhp-schemas.version>
+<dhp-schemas.version>[9.0.0]</dhp-schemas.version>
 <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
 <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
 <dhp.guava.version>11.0.2</dhp.guava.version>