forked from D-Net/dnet-hadoop
Merge remote-tracking branch 'origin/beta' into crossref_mapping_improvement
This commit is contained in:
commit
ac8995ab64
|
@ -28,3 +28,4 @@ spark-warehouse
|
||||||
/**/.scalafmt.conf
|
/**/.scalafmt.conf
|
||||||
/.java-version
|
/.java-version
|
||||||
/dhp-shade-package/dependency-reduced-pom.xml
|
/dhp-shade-package/dependency-reduced-pom.xml
|
||||||
|
/**/job.properties
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
package eu.dnetlib.dhp.oa.merge;
|
package eu.dnetlib.dhp.oa.merge;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
import static org.apache.spark.sql.functions.col;
|
import static org.apache.spark.sql.functions.*;
|
||||||
import static org.apache.spark.sql.functions.when;
|
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob {
|
||||||
.applyCoarVocabularies(entity, vocs),
|
.applyCoarVocabularies(entity, vocs),
|
||||||
OAFENTITY_KRYO_ENC)
|
OAFENTITY_KRYO_ENC)
|
||||||
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
|
.mapGroups(
|
||||||
|
(MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
|
||||||
|
OAFENTITY_KRYO_ENC)
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
||||||
t.getClass().getName(), t),
|
t.getClass().getName(), t),
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||||
|
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
|
@ -696,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
|
||||||
for (Author a : r.getAuthor()) {
|
for (Author a : r.getAuthor()) {
|
||||||
if (Objects.isNull(a.getPid())) {
|
if (Objects.isNull(a.getPid())) {
|
||||||
a.setPid(Lists.newArrayList());
|
a.setPid(Lists.newArrayList());
|
||||||
|
@ -752,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Identify clashing ORCIDS:that is same ORCID associated to multiple authors in this result
|
||||||
|
Map<String, Integer> clashing_orcid = new HashMap<>();
|
||||||
|
|
||||||
|
for (Author a : r.getAuthor()) {
|
||||||
|
a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
p -> StringUtils
|
||||||
|
.contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
|
||||||
|
.map(StructuredProperty::getValue)
|
||||||
|
.distinct()
|
||||||
|
.forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> clashing = clashing_orcid
|
||||||
|
.entrySet()
|
||||||
|
.stream()
|
||||||
|
.filter(ee -> ee.getValue() > 1)
|
||||||
|
.map(Map.Entry::getKey)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
// filter out clashing orcids
|
||||||
|
for (Author a : r.getAuthor()) {
|
||||||
|
a
|
||||||
|
.setPid(
|
||||||
|
a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(p -> !clashing.contains(p.getValue()))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
if (value instanceof Publication) {
|
if (value instanceof Publication) {
|
||||||
|
|
||||||
|
|
|
@ -23,24 +23,30 @@ import org.apache.commons.lang3.tuple.Pair;
|
||||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
||||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||||
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class MergeUtils {
|
public class MergeUtils {
|
||||||
|
|
||||||
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
public static <T extends Oaf> T mergeById(Iterator<T> oafEntityIterator, VocabularyGroup vocs) {
|
||||||
return mergeGroup(s, oafEntityIterator, true);
|
return mergeGroup(oafEntityIterator, true, vocs);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator) {
|
||||||
return mergeGroup(s, oafEntityIterator, false);
|
return mergeGroup(oafEntityIterator, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
|
public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator, boolean checkDelegateAuthority) {
|
||||||
boolean checkDelegateAuthority) {
|
return mergeGroup(oafEntityIterator, checkDelegateAuthority, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
|
||||||
|
boolean checkDelegateAuthority, VocabularyGroup vocs) {
|
||||||
|
|
||||||
ArrayList<T> sortedEntities = new ArrayList<>();
|
ArrayList<T> sortedEntities = new ArrayList<>();
|
||||||
oafEntityIterator.forEachRemaining(sortedEntities::add);
|
oafEntityIterator.forEachRemaining(sortedEntities::add);
|
||||||
|
@ -49,13 +55,55 @@ public class MergeUtils {
|
||||||
Iterator<T> it = sortedEntities.iterator();
|
Iterator<T> it = sortedEntities.iterator();
|
||||||
T merged = it.next();
|
T merged = it.next();
|
||||||
|
|
||||||
|
if (!it.hasNext() && merged instanceof Result && vocs != null) {
|
||||||
|
return enforceResultType(vocs, (Result) merged);
|
||||||
|
} else {
|
||||||
while (it.hasNext()) {
|
while (it.hasNext()) {
|
||||||
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return merged;
|
return merged;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> T enforceResultType(VocabularyGroup vocs, Result mergedResult) {
|
||||||
|
if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) {
|
||||||
|
return (T) mergedResult;
|
||||||
|
} else {
|
||||||
|
final Instance i = mergedResult.getInstance().get(0);
|
||||||
|
|
||||||
|
if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
|
||||||
|
return (T) mergedResult;
|
||||||
|
} else {
|
||||||
|
final String expectedResultType = Optional
|
||||||
|
.ofNullable(
|
||||||
|
vocs
|
||||||
|
.lookupTermBySynonym(
|
||||||
|
ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()))
|
||||||
|
.orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE)
|
||||||
|
.getClassid();
|
||||||
|
|
||||||
|
// there is a clash among the result types
|
||||||
|
if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) {
|
||||||
|
|
||||||
|
Result result = (Result) Optional
|
||||||
|
.ofNullable(ModelSupport.oafTypes.get(expectedResultType))
|
||||||
|
.map(r -> {
|
||||||
|
try {
|
||||||
|
return r.newInstance();
|
||||||
|
} catch (InstantiationException | IllegalAccessException e) {
|
||||||
|
throw new IllegalStateException(e);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.orElse(new OtherResearchProduct());
|
||||||
|
result.setId(mergedResult.getId());
|
||||||
|
return (T) mergeResultFields(result, mergedResult);
|
||||||
|
} else {
|
||||||
|
return (T) mergedResult;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
|
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
|
||||||
return (T) merge(left, right, checkDelegateAuthority);
|
return (T) merge(left, right, checkDelegateAuthority);
|
||||||
}
|
}
|
||||||
|
@ -106,7 +154,7 @@ public class MergeUtils {
|
||||||
return mergeSoftware((Software) left, (Software) right);
|
return mergeSoftware((Software) left, (Software) right);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mergeResultFields((Result) left, (Result) right);
|
return left;
|
||||||
} else if (sameClass(left, right, Datasource.class)) {
|
} else if (sameClass(left, right, Datasource.class)) {
|
||||||
// TODO
|
// TODO
|
||||||
final int trust = compareTrust(left, right);
|
final int trust = compareTrust(left, right);
|
||||||
|
|
|
@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
|
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
|
||||||
spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::crossref");
|
spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||||
spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::pubmed");
|
spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
|
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
|
||||||
spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::openapc");
|
spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
|
||||||
spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::datacite");
|
spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
|
||||||
spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::rawaff");
|
spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
|
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
|
||||||
spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::webcrawl");
|
spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");
|
||||||
|
|
||||||
crossrefRelations
|
crossrefRelations
|
||||||
.union(pubmedRelations)
|
.union(pubmedRelations)
|
||||||
|
|
|
@ -15,6 +15,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
@ -193,8 +194,8 @@ public class ExtractPerson implements Serializable {
|
||||||
private static Relation getProjectRelation(String project, String orcid, String role) {
|
private static Relation getProjectRelation(String project, String orcid, String role) {
|
||||||
|
|
||||||
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
|
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
|
||||||
String target = PROJECT_ID_PREFIX + project.substring(0, 14)
|
String target = PROJECT_ID_PREFIX + StringUtils.substringBefore(project, "::") + "::"
|
||||||
+ IdentifierFactory.md5(project.substring(15));
|
+ IdentifierFactory.md5(StringUtils.substringAfter(project, "::"));
|
||||||
List<KeyValue> properties = new ArrayList<>();
|
List<KeyValue> properties = new ArrayList<>();
|
||||||
|
|
||||||
Relation relation = OafMapperUtils
|
Relation relation = OafMapperUtils
|
||||||
|
@ -345,7 +346,16 @@ public class ExtractPerson implements Serializable {
|
||||||
OafMapperUtils
|
OafMapperUtils
|
||||||
.structuredProperty(
|
.structuredProperty(
|
||||||
op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
|
op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
|
||||||
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
|
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
|
||||||
|
OafMapperUtils.dataInfo(false,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||||
|
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES),
|
||||||
|
"0.91")));
|
||||||
person.setDateofcollection(op.getLastModifiedDate());
|
person.setDateofcollection(op.getLastModifiedDate());
|
||||||
person.setOriginalId(Arrays.asList(op.getOrcid()));
|
person.setOriginalId(Arrays.asList(op.getOrcid()));
|
||||||
person.setDataInfo(ORCIDDATAINFO);
|
person.setDataInfo(ORCIDDATAINFO);
|
||||||
|
|
|
@ -667,11 +667,12 @@ case object Crossref2Oaf {
|
||||||
val doi = input.getString(0)
|
val doi = input.getString(0)
|
||||||
val rorId = input.getString(1)
|
val rorId = input.getString(1)
|
||||||
|
|
||||||
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
|
|
||||||
|
val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true)
|
||||||
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
||||||
|
|
||||||
val r: Relation = new Relation
|
val r: Relation = new Relation
|
||||||
DoiCleaningRule.clean(doi)
|
|
||||||
r.setSource(pubId)
|
r.setSource(pubId)
|
||||||
r.setTarget(affId)
|
r.setTarget(affId)
|
||||||
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||||
|
@ -955,7 +956,26 @@ case object Crossref2Oaf {
|
||||||
case "10.13039/501100010790" =>
|
case "10.13039/501100010790" =>
|
||||||
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
|
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
|
||||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
|
//Add for Danish funders
|
||||||
|
//Independent Research Fund Denmark (IRFD)
|
||||||
|
case "10.13039/501100004836" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "irfd________", a => a)
|
||||||
|
val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
//Carlsberg Foundation (CF)
|
||||||
|
case "10.13039/501100002808" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "cf__________", a => a)
|
||||||
|
val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
//Novo Nordisk Foundation (NNF)
|
||||||
|
case "10.13039/501100009708" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "nnf___________", a => a)
|
||||||
|
val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -135,7 +135,7 @@ public class DedupRecordFactory {
|
||||||
return Collections.emptyIterator();
|
return Collections.emptyIterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
|
OafEntity mergedEntity = MergeUtils.mergeGroup(cliques.iterator());
|
||||||
// dedup records do not have date of transformation attribute
|
// dedup records do not have date of transformation attribute
|
||||||
mergedEntity.setDateoftransformation(null);
|
mergedEntity.setDateoftransformation(null);
|
||||||
mergedEntity
|
mergedEntity
|
||||||
|
|
|
@ -69,6 +69,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
|
|
||||||
Dataset<Relation> mergeRels = spark
|
Dataset<Relation> mergeRels = spark
|
||||||
.read()
|
.read()
|
||||||
|
.schema(REL_BEAN_ENC.schema())
|
||||||
.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
|
.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
|
||||||
.as(REL_BEAN_ENC);
|
.as(REL_BEAN_ENC);
|
||||||
|
|
||||||
|
|
|
@ -46,8 +46,8 @@ class DatasetMergerTest implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
|
void datasetMergerTest() {
|
||||||
Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
|
Dataset pub_merged = MergeUtils.mergeGroup(datasets.stream().map(Tuple2::_2).iterator());
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals(dedupId, pub_merged.getId());
|
assertEquals(dedupId, pub_merged.getId());
|
||||||
|
|
|
@ -566,7 +566,26 @@ case object Crossref2Oaf {
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
|
//Add for Danish funders
|
||||||
|
//Independent Research Fund Denmark (IRFD)
|
||||||
|
case "10.13039/501100004836" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "irfd________", a => a)
|
||||||
|
val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
//Carlsberg Foundation (CF)
|
||||||
|
case "10.13039/501100002808" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "cf__________", a => a)
|
||||||
|
val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
//Novo Nordisk Foundation (NNF)
|
||||||
|
case "10.13039/501100009708" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "nnf___________", a => a)
|
||||||
|
val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -153,34 +153,40 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final DataInfo entityInfo = prepareDataInfo(doc, this.invisible);
|
final DataInfo entityInfo = prepareDataInfo(doc, this.invisible);
|
||||||
final long lastUpdateTimestamp = new Date().getTime();
|
final long lastUpdateTimestamp = new Date().getTime();
|
||||||
|
|
||||||
final List<Instance> instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);
|
final Instance instance = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);
|
||||||
|
|
||||||
final String type = getResultType(doc, instances);
|
if (!Optional
|
||||||
|
.ofNullable(instance.getInstancetype())
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.isPresent()) {
|
||||||
|
return Lists.newArrayList();
|
||||||
|
}
|
||||||
|
|
||||||
return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp);
|
final String type = getResultType(instance);
|
||||||
|
|
||||||
|
return createOafs(doc, type, instance, collectedFrom, entityInfo, lastUpdateTimestamp);
|
||||||
} catch (final DocumentException e) {
|
} catch (final DocumentException e) {
|
||||||
log.error("Error with record:\n" + xml);
|
log.error("Error with record:\n" + xml);
|
||||||
return Lists.newArrayList();
|
return Lists.newArrayList();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getResultType(final Document doc, final List<Instance> instances) {
|
protected String getResultType(final Instance instance) {
|
||||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
|
||||||
|
|
||||||
if (StringUtils.isBlank(type) && this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
|
|
||||||
final String instanceType = instances
|
|
||||||
.stream()
|
|
||||||
.map(i -> i.getInstancetype().getClassid())
|
|
||||||
.findFirst()
|
|
||||||
.filter(s -> !UNKNOWN.equalsIgnoreCase(s))
|
|
||||||
.orElse("0000"); // Unknown
|
|
||||||
return Optional
|
return Optional
|
||||||
.ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
|
.ofNullable(instance.getInstancetype())
|
||||||
.map(Qualifier::getClassid)
|
.map(Qualifier::getClassid)
|
||||||
|
.map(
|
||||||
|
instanceType -> Optional
|
||||||
|
.ofNullable(
|
||||||
|
this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.orElse("0000"))
|
||||||
.orElse("0000");
|
.orElse("0000");
|
||||||
|
} else {
|
||||||
|
throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES);
|
||||||
}
|
}
|
||||||
|
|
||||||
return type;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
|
private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
|
||||||
|
@ -197,12 +203,12 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
protected List<Oaf> createOafs(
|
protected List<Oaf> createOafs(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final String type,
|
final String type,
|
||||||
final List<Instance> instances,
|
final Instance instance,
|
||||||
final KeyValue collectedFrom,
|
final KeyValue collectedFrom,
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final long lastUpdateTimestamp) {
|
final long lastUpdateTimestamp) {
|
||||||
|
|
||||||
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
final OafEntity entity = createEntity(doc, type, instance, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
|
||||||
final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
|
final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
|
||||||
originalId.add(entity.getId());
|
originalId.add(entity.getId());
|
||||||
|
@ -235,19 +241,19 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
private OafEntity createEntity(final Document doc,
|
private OafEntity createEntity(final Document doc,
|
||||||
final String type,
|
final String type,
|
||||||
final List<Instance> instances,
|
final Instance instance,
|
||||||
final KeyValue collectedFrom,
|
final KeyValue collectedFrom,
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final long lastUpdateTimestamp) {
|
final long lastUpdateTimestamp) {
|
||||||
switch (type.toLowerCase()) {
|
switch (type.toLowerCase()) {
|
||||||
case "publication":
|
case "publication":
|
||||||
final Publication p = new Publication();
|
final Publication p = new Publication();
|
||||||
populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(p, doc, instance, collectedFrom, info, lastUpdateTimestamp);
|
||||||
p.setJournal(prepareJournal(doc, info));
|
p.setJournal(prepareJournal(doc, info));
|
||||||
return p;
|
return p;
|
||||||
case "dataset":
|
case "dataset":
|
||||||
final Dataset d = new Dataset();
|
final Dataset d = new Dataset();
|
||||||
populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(d, doc, instance, collectedFrom, info, lastUpdateTimestamp);
|
||||||
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
||||||
d.setDevice(prepareDatasetDevice(doc, info));
|
d.setDevice(prepareDatasetDevice(doc, info));
|
||||||
d.setSize(prepareDatasetSize(doc, info));
|
d.setSize(prepareDatasetSize(doc, info));
|
||||||
|
@ -258,7 +264,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
return d;
|
return d;
|
||||||
case "software":
|
case "software":
|
||||||
final Software s = new Software();
|
final Software s = new Software();
|
||||||
populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(s, doc, instance, collectedFrom, info, lastUpdateTimestamp);
|
||||||
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
||||||
s.setLicense(prepareSoftwareLicenses(doc, info));
|
s.setLicense(prepareSoftwareLicenses(doc, info));
|
||||||
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
||||||
|
@ -268,7 +274,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
case "otherresearchproducts":
|
case "otherresearchproducts":
|
||||||
default:
|
default:
|
||||||
final OtherResearchProduct o = new OtherResearchProduct();
|
final OtherResearchProduct o = new OtherResearchProduct();
|
||||||
populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(o, doc, instance, collectedFrom, info, lastUpdateTimestamp);
|
||||||
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
||||||
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
||||||
o.setTool(prepareOtherResearchProductTools(doc, info));
|
o.setTool(prepareOtherResearchProductTools(doc, info));
|
||||||
|
@ -415,7 +421,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
private void populateResultFields(
|
private void populateResultFields(
|
||||||
final Result r,
|
final Result r,
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final List<Instance> instances,
|
final Instance instance,
|
||||||
final KeyValue collectedFrom,
|
final KeyValue collectedFrom,
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final long lastUpdateTimestamp) {
|
final long lastUpdateTimestamp) {
|
||||||
|
@ -449,8 +455,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
r.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
r.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||||
r.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
r.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
r.setInstance(instances);
|
r.setInstance(Arrays.asList(instance));
|
||||||
r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
|
r.setBestaccessright(OafMapperUtils.createBestAccessRights(Arrays.asList(instance)));
|
||||||
r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
|
r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -509,7 +515,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Instance> prepareInstances(
|
protected abstract Instance prepareInstances(
|
||||||
Document doc,
|
Document doc,
|
||||||
DataInfo info,
|
DataInfo info,
|
||||||
KeyValue collectedfrom,
|
KeyValue collectedfrom,
|
||||||
|
|
|
@ -133,7 +133,7 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication {
|
||||||
inputRdd
|
inputRdd
|
||||||
.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
|
.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
|
||||||
.groupByKey()
|
.groupByKey()
|
||||||
.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
|
.map(t -> MergeUtils.mergeGroup(t._2.iterator())),
|
||||||
// .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
|
// .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
|
||||||
// .reduceByKey(MergeUtils::merge)
|
// .reduceByKey(MergeUtils::merge)
|
||||||
// .map(Tuple2::_2),
|
// .map(Tuple2::_2),
|
||||||
|
|
|
@ -135,7 +135,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Instance> prepareInstances(
|
protected Instance prepareInstances(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final KeyValue collectedfrom,
|
final KeyValue collectedfrom,
|
||||||
|
@ -197,7 +197,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
instance.getUrl().addAll(validUrl);
|
instance.getUrl().addAll(validUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Lists.newArrayList(instance);
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -126,7 +126,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Instance> prepareInstances(
|
protected Instance prepareInstances(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final KeyValue collectedfrom,
|
final KeyValue collectedfrom,
|
||||||
|
@ -210,7 +210,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
instance.setUrl(new ArrayList<>());
|
instance.setUrl(new ArrayList<>());
|
||||||
instance.getUrl().addAll(validUrl);
|
instance.getUrl().addAll(validUrl);
|
||||||
}
|
}
|
||||||
return Arrays.asList(instance);
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String trimAndDecodeUrl(String url) {
|
protected String trimAndDecodeUrl(String url) {
|
||||||
|
|
|
@ -51,6 +51,7 @@
|
||||||
<arg>--orcidPath</arg><arg>${orcidPath}</arg>
|
<arg>--orcidPath</arg><arg>${orcidPath}</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
||||||
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="reset_outputpath"/>
|
<ok to="reset_outputpath"/>
|
||||||
|
|
|
@ -162,6 +162,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
||||||
|
@ -197,6 +198,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=8000
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
||||||
|
@ -232,6 +234,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=5000
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
||||||
|
@ -267,6 +270,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=2000
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
||||||
|
@ -302,6 +306,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=1000
|
--conf spark.sql.shuffle.partitions=1000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
||||||
|
@ -337,6 +342,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=1000
|
--conf spark.sql.shuffle.partitions=1000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
||||||
|
@ -372,6 +378,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=2000
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
||||||
|
@ -407,6 +414,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=2000
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/person</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/person</arg>
|
||||||
|
@ -442,6 +450,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||||
--conf spark.sql.shuffle.partitions=20000
|
--conf spark.sql.shuffle.partitions=20000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
||||||
|
|
|
@ -133,7 +133,7 @@ object SparkCreateInputGraph {
|
||||||
val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
|
val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
|
||||||
|
|
||||||
ds.groupByKey(_.getId)
|
ds.groupByKey(_.getId)
|
||||||
.mapGroups { (id, it) => MergeUtils.mergeGroup(id, it.asJava).asInstanceOf[T] }
|
.mapGroups { (id, it) => MergeUtils.mergeGroup(it.asJava).asInstanceOf[T] }
|
||||||
// .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] }
|
// .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] }
|
||||||
// .map(_)
|
// .map(_)
|
||||||
.write
|
.write
|
||||||
|
|
Loading…
Reference in New Issue