forked from D-Net/dnet-hadoop
merged conflicts on beta
commit b0478c380e
@@ -2,8 +2,7 @@
 package eu.dnetlib.dhp.oa.merge;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.when;
+import static org.apache.spark.sql.functions.*;

 import java.util.Map;
 import java.util.Optional;
@@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob
                     .applyCoarVocabularies(entity, vocs),
                 OAFENTITY_KRYO_ENC)
             .groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
-            .mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
+            .mapGroups(
+                (MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
+                OAFENTITY_KRYO_ENC)
             .map(
                 (MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
                     t.getClass().getName(), t),
@@ -2,7 +2,6 @@
 package eu.dnetlib.dhp.schema.oaf.utils;

 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;

 import java.net.MalformedURLException;
@@ -696,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions
                     }
                 }

+                // set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
                 for (Author a : r.getAuthor()) {
                     if (Objects.isNull(a.getPid())) {
                         a.setPid(Lists.newArrayList());
@@ -752,6 +752,40 @@
                                 .collect(Collectors.toList()));
                     }
                 }
+
+                // Identify clashing ORCIDs: that is, the same ORCID associated to multiple authors in this result
+                Map<String, Integer> clashing_orcid = new HashMap<>();
+
+                for (Author a : r.getAuthor()) {
+                    a
+                        .getPid()
+                        .stream()
+                        .filter(
+                            p -> StringUtils
+                                .contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
+                        .map(StructuredProperty::getValue)
+                        .distinct()
+                        .forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
+                }
+
+                Set<String> clashing = clashing_orcid
+                    .entrySet()
+                    .stream()
+                    .filter(ee -> ee.getValue() > 1)
+                    .map(Map.Entry::getKey)
+                    .collect(Collectors.toSet());
+
+                // filter out clashing orcids
+                for (Author a : r.getAuthor()) {
+                    a
+                        .setPid(
+                            a
+                                .getPid()
+                                .stream()
+                                .filter(p -> !clashing.contains(p.getValue()))
+                                .collect(Collectors.toList()));
+                }
+
             }
             if (value instanceof Publication) {

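For reference, a minimal self-contained sketch of the clashing-ORCID rule introduced in the hunk above, using plain Java collections instead of the Oaf model classes (all names below are illustrative and not part of the commit): an ORCID attached to more than one author of the same result counts as clashing and is dropped from every author.

import java.util.*;
import java.util.stream.Collectors;

public class ClashingOrcidSketch {
    public static void main(String[] args) {
        // author -> ORCID values (hypothetical sample data)
        Map<String, List<String>> authorPids = new LinkedHashMap<>();
        authorPids.put("Author A", List.of("0000-0001-0000-0001"));
        authorPids.put("Author B", List.of("0000-0001-0000-0001")); // same ORCID as Author A -> clash
        authorPids.put("Author C", List.of("0000-0002-0000-0002"));

        // count how many authors carry each ORCID (each author counted once, hence distinct())
        Map<String, Integer> occurrences = new HashMap<>();
        authorPids.values()
            .forEach(pids -> pids.stream().distinct()
                .forEach(orcid -> occurrences.compute(orcid, (k, v) -> v == null ? 1 : v + 1)));

        // ORCIDs shared by more than one author are clashing
        Set<String> clashing = occurrences.entrySet().stream()
            .filter(e -> e.getValue() > 1)
            .map(Map.Entry::getKey)
            .collect(Collectors.toSet());

        // drop clashing ORCIDs from every author
        authorPids.replaceAll((author, pids) -> pids.stream()
            .filter(orcid -> !clashing.contains(orcid))
            .collect(Collectors.toList()));

        System.out.println(authorPids);
        // prints: {Author A=[], Author B=[], Author C=[0000-0002-0000-0002]}
    }
}

Note that in the hunk above only pids whose qualifier classid contains ORCID_PENDING feed the counting pass, while the removal pass drops any pid whose value turned out to be clashing.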
@@ -16,6 +16,8 @@ import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.common.EntityType;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -31,16 +33,20 @@ import eu.dnetlib.dhp.schema.oaf.*;

 public class MergeUtils {

-    public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
-        return mergeGroup(s, oafEntityIterator, true);
+    public static <T extends Oaf> T mergeById(Iterator<T> oafEntityIterator, VocabularyGroup vocs) {
+        return mergeGroup(oafEntityIterator, true, vocs);
     }

-    public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
-        return mergeGroup(s, oafEntityIterator, false);
+    public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator) {
+        return mergeGroup(oafEntityIterator, false);
     }

-    public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
-        boolean checkDelegateAuthority) {
+    public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator, boolean checkDelegateAuthority) {
+        return mergeGroup(oafEntityIterator, checkDelegateAuthority, null);
+    }
+
+    public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
+        boolean checkDelegateAuthority, VocabularyGroup vocs) {

         ArrayList<T> sortedEntities = new ArrayList<>();
         oafEntityIterator.forEachRemaining(sortedEntities::add);
@@ -49,13 +55,54 @@ public class MergeUtils
         Iterator<T> it = sortedEntities.iterator();
         T merged = it.next();

+        if (!it.hasNext() && merged instanceof Result && vocs != null) {
+            return enforceResultType(vocs, (Result) merged);
+        } else {
             while (it.hasNext()) {
                 merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
             }

+        }
         return merged;
     }

+    private static <T extends Oaf> T enforceResultType(VocabularyGroup vocs, Result mergedResult) {
+        if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) {
+            return (T) mergedResult;
+        } else {
+            final Instance i = mergedResult.getInstance().get(0);
+
+            if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
+                return (T) mergedResult;
+            } else {
+                final Qualifier expectedResultType = vocs.lookupTermBySynonym(
+                    ModelConstants.DNET_RESULT_TYPOLOGIES,
+                    i.getInstancetype().getClassid());
+
+                if (Objects.isNull(expectedResultType)) {
+                    throw new IllegalArgumentException(
+                        "instance type not bound to any result type in dnet:result_typologies: " +
+                            i.getInstancetype().getClassid());
+                }
+
+                // there is a clash among the result types
+                if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) {
+                    try {
+                        String resulttype = expectedResultType.getClassid();
+                        if (EntityType.otherresearchproduct.toString().equals(resulttype)) {
+                            resulttype = "other";
+                        }
+                        Result result = (Result) ModelSupport.oafTypes.get(resulttype).newInstance();
+                        return (T) mergeResultFields(result, mergedResult);
+                    } catch (InstantiationException | IllegalAccessException e) {
+                        throw new IllegalStateException(e);
+                    }
+                } else {
+                    return (T) mergedResult;
+                }
+            }
+        }
+    }
+
     public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
         return (T) merge(left, right, checkDelegateAuthority);
     }
@@ -106,7 +153,7 @@ public class MergeUtils
             return mergeSoftware((Software) left, (Software) right);
         }

-        return mergeResultFields((Result) left, (Result) right);
+        return left;
     } else if (sameClass(left, right, Datasource.class)) {
         // TODO
         final int trust = compareTrust(left, right);
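The MergeUtils hunks above drop the unused grouping-key parameter from mergeById/mergeGroup and thread a VocabularyGroup through mergeById, so that single-record groups can have their result type enforced against dnet:result_typologies. A hedged sketch of how a caller migrates to the new signatures (import paths assumed from the diff context; entities and vocs stand for values the caller already has):

import java.util.Iterator;

import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;

class MergeUtilsCallerSketch {

    // before: MergeUtils.mergeById(dedupId, entities) / MergeUtils.mergeGroup(dedupId, entities)
    // after:  the grouping key is gone; mergeById additionally receives the vocabularies
    static OafEntity merge(Iterator<OafEntity> entities, VocabularyGroup vocs) {
        return MergeUtils.mergeById(entities, vocs); // result type enforced via vocs
        // without result-type enforcement (and without the delegated-authority check):
        // return MergeUtils.mergeGroup(entities);
    }
}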
@@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable
             .listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);

         JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
-            spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::crossref");
+            spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");

         JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
-            spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::pubmed");
+            spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");

         JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
-            spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::openapc");
+            spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");

         JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
-            spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::datacite");
+            spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");

         JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
-            spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::rawaff");
+            spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");

         JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
-            spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::webcrawl");
+            spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");

         crossrefRelations
             .union(pubmedRelations)
@@ -193,8 +193,8 @@ public class ExtractPerson implements Serializable
     private static Relation getProjectRelation(String project, String orcid, String role) {

         String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
-        String target = PROJECT_ID_PREFIX + project.substring(0, 14)
-            + IdentifierFactory.md5(project.substring(15));
+        String target = PROJECT_ID_PREFIX + StringUtils.substringBefore(project, "::") + "::"
+            + IdentifierFactory.md5(StringUtils.substringAfter(project, "::"));
         List<KeyValue> properties = new ArrayList<>();

         Relation relation = OafMapperUtils
@@ -345,7 +345,16 @@ public class ExtractPerson implements Serializable
                 OafMapperUtils
                     .structuredProperty(
                         op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
-                        ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
+                        ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
+                        OafMapperUtils.dataInfo(false,
+                            null,
+                            false,
+                            false,
+                            OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+                                ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+                                ModelConstants.DNET_PID_TYPES,
+                                ModelConstants.DNET_PID_TYPES),
+                            "0.91")));
         person.setDateofcollection(op.getLastModifiedDate());
         person.setOriginalId(Arrays.asList(op.getOrcid()));
         person.setDataInfo(ORCIDDATAINFO);
@@ -135,7 +135,7 @@ public class DedupRecordFactory
             return Collections.emptyIterator();
         }

-        OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
+        OafEntity mergedEntity = MergeUtils.mergeGroup(cliques.iterator());
         // dedup records do not have date of transformation attribute
         mergedEntity.setDateoftransformation(null);
         mergedEntity
@@ -69,6 +69,7 @@ public class SparkPropagateRelation extends AbstractSparkAction

         Dataset<Relation> mergeRels = spark
             .read()
+            .schema(REL_BEAN_ENC.schema())
             .load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
             .as(REL_BEAN_ENC);

@@ -46,8 +46,8 @@ class DatasetMergerTest implements Serializable
     }

     @Test
-    void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
-        Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+    void datasetMergerTest() {
+        Dataset pub_merged = MergeUtils.mergeGroup(datasets.stream().map(Tuple2::_2).iterator());

         // verify id
         assertEquals(dedupId, pub_merged.getId());
@@ -155,7 +155,7 @@ public abstract class AbstractMdRecordToOafMapper

             final List<Instance> instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);

-            final String type = getResultType(doc, instances);
+            final String type = getResultType(instances);

             return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp);
         } catch (final DocumentException e) {
@@ -164,10 +164,9 @@
         }
     }

-    protected String getResultType(final Document doc, final List<Instance> instances) {
-        final String type = doc.valueOf("//dr:CobjCategory/@type");
+    protected String getResultType(final List<Instance> instances) {

-        if (StringUtils.isBlank(type) && this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
+        if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
             final String instanceType = instances
                 .stream()
                 .map(i -> i.getInstancetype().getClassid())
@@ -178,9 +177,9 @@
                 .ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
                 .map(Qualifier::getClassid)
                 .orElse("0000");
+        } else {
+            throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES);
         }
-
-        return type;
     }

     private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
@@ -133,7 +133,7 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication
                 inputRdd
                     .keyBy(oaf -> ModelSupport.idFn().apply(oaf))
                     .groupByKey()
-                    .map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
+                    .map(t -> MergeUtils.mergeGroup(t._2.iterator())),
                 // .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
                 // .reduceByKey(MergeUtils::merge)
                 // .map(Tuple2::_2),
@@ -51,6 +51,7 @@
             <arg>--orcidPath</arg><arg>${orcidPath}</arg>
             <arg>--targetPath</arg><arg>${targetPath}</arg>
             <arg>--graphPath</arg><arg>${graphPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
             <arg>--master</arg><arg>yarn</arg>
         </spark>
         <ok to="reset_outputpath"/>
@@ -133,7 +133,7 @@ object SparkCreateInputGraph
     val ds: Dataset[T] = spark.read.load(sourcePath).as[T]

     ds.groupByKey(_.getId)
-      .mapGroups { (id, it) => MergeUtils.mergeGroup(id, it.asJava).asInstanceOf[T] }
+      .mapGroups { (id, it) => MergeUtils.mergeGroup(it.asJava).asInstanceOf[T] }
       // .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] }
       // .map(_)
       .write