1
0
Fork 0

Merge branch 'beta' into distinct_pids_from_openorgs_beta

This commit is contained in:
Claudio Atzori 2023-06-12 09:58:21 +02:00
commit ad04f14b81
39 changed files with 3031 additions and 452 deletions

View File

@ -57,7 +57,10 @@ public class PropagationConstant {
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository"; public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository";
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID = "result:organization:semrel"; public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID = "result:organization:semrel";
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of affiliation to result through sematic relations"; public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of affiliation to result through semantic relations";
public static final String PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID = "project:organization:semrel";
public static final String PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of participation to project through semantic relations";
public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel"; public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel";
public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation"; public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation";
@ -171,6 +174,41 @@ public class PropagationConstant {
return newRelations; return newRelations;
} }
public static Relation getRelation(String source, String target, String rel_class) {
if (ModelConstants.HAS_PARTICIPANT.equals(rel_class)) {
return getParticipantRelation(source, target, rel_class);
} else
return getAffiliationRelation(source, target, rel_class);
}
public static Relation getParticipantRelation(
String source,
String target,
String rel_class) {
return getRelation(
source, target,
rel_class,
ModelConstants.PROJECT_ORGANIZATION,
ModelConstants.PARTICIPATION,
PROPAGATION_DATA_INFO_TYPE,
PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID,
PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME);
}
public static Relation getAffiliationRelation(
String source,
String target,
String rel_class) {
return getRelation(
source, target,
rel_class,
ModelConstants.RESULT_ORGANIZATION,
ModelConstants.AFFILIATION,
PROPAGATION_DATA_INFO_TYPE,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME);
}
public static Relation getRelation( public static Relation getRelation(
String source, String source,
String target, String target,

View File

@ -15,6 +15,7 @@ public class Community implements Serializable {
private List<Provider> providers = new ArrayList<>(); private List<Provider> providers = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>(); private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
private SelectionConstraints constraints = new SelectionConstraints(); private SelectionConstraints constraints = new SelectionConstraints();
private SelectionConstraints removeConstraints = new SelectionConstraints();
public String toJson() { public String toJson() {
final Gson g = new Gson(); final Gson g = new Gson();
@ -67,4 +68,12 @@ public class Community implements Serializable {
public void setConstraints(SelectionConstraints constraints) { public void setConstraints(SelectionConstraints constraints) {
this.constraints = constraints; this.constraints = constraints;
} }
public SelectionConstraints getRemoveConstraints() {
return removeConstraints;
}
public void setRemoveConstraints(SelectionConstraints removeConstraints) {
this.removeConstraints = removeConstraints;
}
} }

View File

@ -28,6 +28,8 @@ public class CommunityConfiguration implements Serializable {
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>(); private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
// map eosc datasource -> communityid // map eosc datasource -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>(); private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
// map communityid -> remove constraints
private Map<String, SelectionConstraints> removeConstraintsMap = new HashMap<>();
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() { public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
return eoscDatasourceMap; return eoscDatasourceMap;
@ -71,6 +73,14 @@ public class CommunityConfiguration implements Serializable {
this.selectionConstraintsMap = selectionConstraintsMap; this.selectionConstraintsMap = selectionConstraintsMap;
} }
public Map<String, SelectionConstraints> getRemoveConstraintsMap() {
return removeConstraintsMap;
}
public void setRemoveConstraintsMap(Map<String, SelectionConstraints> removeConstraintsMap) {
this.removeConstraintsMap = removeConstraintsMap;
}
CommunityConfiguration(final Map<String, Community> communities) { CommunityConfiguration(final Map<String, Community> communities) {
this.communities = communities; this.communities = communities;
init(); init();
@ -90,6 +100,9 @@ public class CommunityConfiguration implements Serializable {
if (selectionConstraintsMap == null) { if (selectionConstraintsMap == null) {
selectionConstraintsMap = Maps.newHashMap(); selectionConstraintsMap = Maps.newHashMap();
} }
if (removeConstraintsMap == null) {
removeConstraintsMap = Maps.newHashMap();
}
for (Community c : getCommunities().values()) { for (Community c : getCommunities().values()) {
// get subjects // get subjects
@ -111,6 +124,8 @@ public class CommunityConfiguration implements Serializable {
zenodocommunityMap); zenodocommunityMap);
} }
selectionConstraintsMap.put(id, c.getConstraints()); selectionConstraintsMap.put(id, c.getConstraints());
removeConstraintsMap.put(id, c.getRemoveConstraints());
} }
} }

View File

@ -86,6 +86,7 @@ public class CommunityConfigurationFactory {
c.setProviders(parseDatasources(node)); c.setProviders(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node)); c.setZenodoCommunities(parseZenodoCommunities(node));
c.setConstraints(parseConstrains(node)); c.setConstraints(parseConstrains(node));
c.setRemoveConstraints(parseRemoveConstrains(node));
return c; return c;
} }
@ -102,6 +103,19 @@ public class CommunityConfigurationFactory {
return selectionConstraints; return selectionConstraints;
} }
private static SelectionConstraints parseRemoveConstrains(Node node) {
Node constsNode = node.selectSingleNode("./removeConstraints");
if (constsNode == null || StringUtils.isBlank(StringUtils.trim(constsNode.getText()))) {
return new SelectionConstraints();
}
SelectionConstraints selectionConstraints = new Gson()
.fromJson(constsNode.getText(), SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
return selectionConstraints;
}
private static List<String> parseSubjects(final Node node) { private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList(); final List<String> subjects = Lists.newArrayList();

View File

@ -79,6 +79,23 @@ public class ResultTagger implements Serializable {
break; break;
} }
// communities contains all the communities to be not added to the context
final Set<String> removeCommunities = new HashSet<>();
conf
.getRemoveConstraintsMap()
.keySet()
.forEach(communityId -> {
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
conf
.getRemoveConstraintsMap()
.get(communityId)
.getCriteria()
.stream()
.anyMatch(crit -> crit.verifyCriteria(param)))
removeCommunities.add(communityId);
});
// communities contains all the communities to be added as context for the result // communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>(); final Set<String> communities = new HashSet<>();
@ -164,7 +181,8 @@ public class ResultTagger implements Serializable {
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.keySet() .keySet()
.forEach(communityId -> { .forEach(communityId -> {
if (conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && if (!removeCommunities.contains(communityId) &&
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
conf conf
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.get(communityId) .get(communityId)
@ -175,6 +193,9 @@ public class ResultTagger implements Serializable {
}); });
communities.addAll(aconstraints); communities.addAll(aconstraints);
communities.removeAll(removeCommunities);
if (aconstraints.size() > 0) if (aconstraints.size() > 0)
log.info("Found {} for advancedConstraints ", aconstraints.size()); log.info("Found {} for advancedConstraints ", aconstraints.size());

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import java.io.Serializable; import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
@ -47,13 +47,20 @@ public class PrepareInfo implements Serializable {
"' and datainfo.deletedbyinference = false " + "' and datainfo.deletedbyinference = false " +
"GROUP BY source"; "GROUP BY source";
// associate projects to all the participant orgs
private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " +
"FROM relation " +
"WHERE lower(relclass) = '" + ModelConstants.HAS_PARTICIPANT.toLowerCase() +
"' and datainfo.deletedbyinference = false " +
"GROUP BY source";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
SparkResultToOrganizationFromIstRepoJob.class SparkResultToOrganizationFromIstRepoJob.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json")); "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -74,6 +81,9 @@ public class PrepareInfo implements Serializable {
final String resultOrganizationPath = parser.get("resultOrgPath"); final String resultOrganizationPath = parser.get("resultOrgPath");
log.info("resultOrganizationPath: {}", resultOrganizationPath); log.info("resultOrganizationPath: {}", resultOrganizationPath);
final String projectOrgPath = parser.get("projectOrganizationPath");
log.info("projectOrgPath: {}", projectOrgPath);
final String relationPath = parser.get("relationPath"); final String relationPath = parser.get("relationPath");
log.info("relationPath: {}", relationPath); log.info("relationPath: {}", relationPath);
@ -89,11 +99,13 @@ public class PrepareInfo implements Serializable {
childParentPath, childParentPath,
leavesPath, leavesPath,
resultOrganizationPath, resultOrganizationPath,
projectOrgPath,
relationPath)); relationPath));
} }
private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath, private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath,
String currentIterationPath, String resultOrganizationPath, String relationPath) { String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath,
String relationPath) {
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class); Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
relation.createOrReplaceTempView("relation"); relation.createOrReplaceTempView("relation");
@ -113,6 +125,14 @@ public class PrepareInfo implements Serializable {
.option("compression", "gzip") .option("compression", "gzip")
.json(resultOrganizationPath); .json(resultOrganizationPath);
spark
.sql(PROJECT_ORGANIZATION_QUERY)
.as(Encoders.bean(KeyValueSet.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(projectOrganizationPath);
relation relation
.filter( .filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() && (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
@ -120,7 +140,16 @@ public class PrepareInfo implements Serializable {
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(relationPath); .json(relationPath + "/result");
relation
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.HAS_PARTICIPANT))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(relationPath + "/project");
Dataset<String> children = spark Dataset<String> children = spark
.sql( .sql(

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import java.io.Serializable; import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
@ -30,7 +30,8 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
public class SparkResultToOrganizationFromSemRel implements Serializable { public class SparkResultToOrganizationFromSemRel implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class); private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class);
private static final int MAX_ITERATION = 5; private static final int MAX_ITERATION = 5;
public static final String NEW_RELATION_PATH = "/newRelation"; public static final String NEW_RESULT_RELATION_PATH = "/newResultRelation";
public static final String NEW_PROJECT_RELATION_PATH = "/newProjectRelation";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -38,7 +39,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
.toString( .toString(
SparkResultToOrganizationFromIstRepoJob.class SparkResultToOrganizationFromIstRepoJob.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json")); "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -62,6 +63,9 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
final String resultOrganizationPath = parser.get("resultOrgPath"); final String resultOrganizationPath = parser.get("resultOrgPath");
log.info("resultOrganizationPath: {}", resultOrganizationPath); log.info("resultOrganizationPath: {}", resultOrganizationPath);
final String projectOrganizationPath = parser.get("projectOrganizationPath");
log.info("projectOrganizationPath: {}", projectOrganizationPath);
final String workingPath = parser.get("workingDir"); final String workingPath = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingPath: {}", workingPath);
@ -88,6 +92,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
leavesPath, leavesPath,
childParentPath, childParentPath,
resultOrganizationPath, resultOrganizationPath,
projectOrganizationPath,
relationPath, relationPath,
workingPath, workingPath,
outputPath, outputPath,
@ -98,13 +103,14 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
String leavesPath, String leavesPath,
String childParentPath, String childParentPath,
String resultOrganizationPath, String resultOrganizationPath,
String projectOrganizationPath,
String graphPath, String graphPath,
String workingPath, String workingPath,
String outputPath, String outputPath,
int iterations) { int iterations) {
if (iterations == 1) { if (iterations == 1) {
doPropagateOnce( doPropagateOnce(
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, spark, leavesPath, childParentPath, resultOrganizationPath, projectOrganizationPath, graphPath,
workingPath, outputPath); workingPath, outputPath);
} else { } else {
@ -123,26 +129,34 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
notReachedFirstParent); notReachedFirstParent);
doPropagate( doPropagate(
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, spark, leavesPath, childParentPath, resultOrganizationPath, projectOrganizationPath, graphPath,
workingPath, outputPath, propagationCounter); workingPath, outputPath, propagationCounter);
} }
} }
private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath, private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String graphPath, String workingPath, String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath,
String outputPath) { String outputPath) {
StepActions StepActions
.execStep( .execStep(
spark, graphPath, workingPath + NEW_RELATION_PATH, spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath); leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION);
addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath);
StepActions
.execStep(
spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH,
leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT);
addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath);
} }
private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String graphPath, String workingPath, String outputPath, String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath,
String outputPath,
PropagationCounter propagationCounter) { PropagationCounter propagationCounter) {
int iteration = 0; int iteration = 0;
long leavesCount; long leavesCount;
@ -151,13 +165,18 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
iteration++; iteration++;
StepActions StepActions
.execStep( .execStep(
spark, graphPath, workingPath + NEW_RELATION_PATH, spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath); leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION);
StepActions
.execStep(
spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH,
leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT);
StepActions StepActions
.prepareForNextStep( .prepareForNextStep(
spark, workingPath + NEW_RELATION_PATH, resultOrganizationPath, leavesPath, spark, workingPath, resultOrganizationPath, projectOrganizationPath, leavesPath,
childParentPath, workingPath + "/leaves", workingPath + "/resOrg"); childParentPath, workingPath + "/leaves", workingPath + "/resOrg", workingPath + "/projOrg");
moveOutput(spark, workingPath, leavesPath, resultOrganizationPath); moveOutput(spark, workingPath, leavesPath, resultOrganizationPath, projectOrganizationPath);
leavesCount = readPath(spark, leavesPath, Leaves.class).count(); leavesCount = readPath(spark, leavesPath, Leaves.class).count();
} while (leavesCount > 0 && iteration < MAX_ITERATION); } while (leavesCount > 0 && iteration < MAX_ITERATION);
@ -185,7 +204,8 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
propagationCounter.getNotReachedFirstParent().add(1); propagationCounter.getNotReachedFirstParent().add(1);
} }
addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath);
addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath);
} }
private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, private static void moveOutput(SparkSession spark, String workingPath, String leavesPath,
@ -204,6 +224,28 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
} }
private static void moveOutput(SparkSession spark, String workingPath, String leavesPath,
String resultOrganizationPath, String projectOrganizationPath) {
readPath(spark, workingPath + "/leaves", Leaves.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(leavesPath);
readPath(spark, workingPath + "/resOrg", KeyValueSet.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(resultOrganizationPath);
readPath(spark, workingPath + "/projOrg", KeyValueSet.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(projectOrganizationPath);
}
private static void addNewRelations(SparkSession spark, String newRelationPath, String outputPath) { private static void addNewRelations(SparkSession spark, String newRelationPath, String outputPath) {
Dataset<Relation> relation = readPath(spark, newRelationPath, Relation.class); Dataset<Relation> relation = readPath(spark, newRelationPath, Relation.class);
@ -212,16 +254,21 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
.mapGroups( .mapGroups(
(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class)) (MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class))
.flatMap( .flatMap(
(FlatMapFunction<Relation, Relation>) r -> Arrays (FlatMapFunction<Relation, Relation>) r -> {
.asList( if (r.getSource().startsWith("50|")) {
r, getRelation( return Arrays
r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF, .asList(
ModelConstants.RESULT_ORGANIZATION, r, getAffiliationRelation(
ModelConstants.AFFILIATION, r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF))
PROPAGATION_DATA_INFO_TYPE, .iterator();
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, } else {
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME)) return Arrays
.iterator() .asList(
r, getParticipantRelation(
r.getTarget(), r.getSource(), ModelConstants.IS_PARTICIPANT))
.iterator();
}
}
, Encoders.bean(Relation.class)) , Encoders.bean(Relation.class))
.write() .write()

View File

@ -1,8 +1,10 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.PropagationConstant.readPath; import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_PROJECT_RELATION_PATH;
import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_RESULT_RELATION_PATH;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
@ -14,8 +16,6 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -28,13 +28,14 @@ public class StepActions implements Serializable {
public static void execStep(SparkSession spark, public static void execStep(SparkSession spark,
String graphPath, String newRelationPath, String graphPath, String newRelationPath,
String leavesPath, String chldParentOrgPath, String resultOrgPath) { String leavesPath, String chldParentOrgPath, String entityOrgPath, String rel_class) {
Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class); Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class);
// select only the relation source target among those proposed by propagation that are not already existent // select only the relation source target among those proposed by propagation that are not already existent
getNewRels( getNewRels(
newRelationPath, relationGraph, newRelationPath, relationGraph,
getPropagationRelation(spark, leavesPath, chldParentOrgPath, resultOrgPath)); getPropagationRelation(spark, leavesPath, chldParentOrgPath, entityOrgPath, rel_class));
} }
@ -45,16 +46,33 @@ public class StepActions implements Serializable {
changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath); changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath);
// add the new relations obtained from propagation to the keyvalueset result organization // add the new relations obtained from propagation to the keyvalueset result organization
updateResultOrganization( updateEntityOrganization(
spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath); spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath);
} }
private static void updateResultOrganization(SparkSession spark, String resultOrgPath, public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath,
String projectOrgPath,
String leavesPath, String chldParentOrgPath, String leavesOutputPath,
String orgOutputPath, String outputProjectPath) {
// use of the parents as new leaves set
changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath);
// add the new relations obtained from propagation to the keyvalueset result organization
updateEntityOrganization(
spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class),
orgOutputPath);
updateEntityOrganization(
spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class),
outputProjectPath);
}
private static void updateEntityOrganization(SparkSession spark, String entityOrgPath,
Dataset<Relation> selectedRels, String outputPath) { Dataset<Relation> selectedRels, String outputPath) {
Dataset<KeyValueSet> resultOrg = readPath(spark, resultOrgPath, KeyValueSet.class); Dataset<KeyValueSet> entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class);
resultOrg entityOrg
.joinWith( .joinWith(
selectedRels, resultOrg selectedRels, entityOrg
.col("key") .col("key")
.equalTo(selectedRels.col("source")), .equalTo(selectedRels.col("source")),
"left") "left")
@ -129,7 +147,12 @@ public class StepActions implements Serializable {
.getDataInfo() .getDataInfo()
.getProvenanceaction() .getProvenanceaction()
.getClassid() .getClassid()
.equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID)) .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID)
&& !rel
.getDataInfo()
.getProvenanceaction()
.getClassid()
.equals(PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID))
.count() > 0) { .count() > 0) {
return null; return null;
} }
@ -152,19 +175,20 @@ public class StepActions implements Serializable {
private static Dataset<Relation> getPropagationRelation(SparkSession spark, private static Dataset<Relation> getPropagationRelation(SparkSession spark,
String leavesPath, String leavesPath,
String chldParentOrgPath, String chldParentOrgPath,
String resultOrgPath) { String entityOrgPath,
String semantics) {
Dataset<KeyValueSet> childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class); Dataset<KeyValueSet> childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class);
Dataset<KeyValueSet> resultOrg = readPath(spark, resultOrgPath, KeyValueSet.class); Dataset<KeyValueSet> entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class);
Dataset<Leaves> leaves = readPath(spark, leavesPath, Leaves.class); Dataset<Leaves> leaves = readPath(spark, leavesPath, Leaves.class);
childParent.createOrReplaceTempView("childParent"); childParent.createOrReplaceTempView("childParent");
resultOrg.createOrReplaceTempView("resultOrg"); entityOrg.createOrReplaceTempView("entityOrg");
leaves.createOrReplaceTempView("leaves"); leaves.createOrReplaceTempView("leaves");
Dataset<KeyValueSet> resultParent = spark Dataset<KeyValueSet> resultParent = spark
.sql( .sql(
"SELECT resId as key, " + "SELECT entityId as key, " +
"collect_set(parent) valueSet " + "collect_set(parent) valueSet " +
"FROM (SELECT key as child, parent " + "FROM (SELECT key as child, parent " +
" FROM childParent " + " FROM childParent " +
@ -172,14 +196,14 @@ public class StepActions implements Serializable {
"JOIN leaves " + "JOIN leaves " +
"ON leaves.value = cp.child " + "ON leaves.value = cp.child " +
"JOIN (" + "JOIN (" +
"SELECT key as resId, org " + "SELECT key as entityId, org " +
"FROM resultOrg " + "FROM entityOrg " +
"LATERAL VIEW explode (valueSet) ks as org ) as ro " + "LATERAL VIEW explode (valueSet) ks as org ) as ro " +
"ON leaves.value = ro.org " + "ON leaves.value = ro.org " +
"GROUP BY resId") "GROUP BY entityId")
.as(Encoders.bean(KeyValueSet.class)); .as(Encoders.bean(KeyValueSet.class));
// create new relations from result to organization for each result linked to a leaf // create new relations from entity to organization for each entity linked to a leaf
return resultParent return resultParent
.flatMap( .flatMap(
(FlatMapFunction<KeyValueSet, Relation>) v -> v (FlatMapFunction<KeyValueSet, Relation>) v -> v
@ -189,12 +213,7 @@ public class StepActions implements Serializable {
orgId -> getRelation( orgId -> getRelation(
v.getKey(), v.getKey(),
orgId, orgId,
ModelConstants.HAS_AUTHOR_INSTITUTION, semantics))
ModelConstants.RESULT_ORGANIZATION,
ModelConstants.AFFILIATION,
PROPAGATION_DATA_INFO_TYPE,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME))
.collect(Collectors.toList()) .collect(Collectors.toList())
.iterator(), .iterator(),
Encoders.bean(Relation.class)); Encoders.bean(Relation.class));

View File

@ -10,6 +10,9 @@ where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//con
return return
<community> <community>
{ $x//CONFIGURATION/context/@id} { $x//CONFIGURATION/context/@id}
<removeConstraints>
{$x//CONFIGURATION/context/param[./@name='removeConstraints']/text() }
</removeConstraints>
<advancedConstraints> <advancedConstraints>
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() } {$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
</advancedConstraints> </advancedConstraints>

View File

@ -40,5 +40,11 @@
"paramLongName": "relationPath", "paramLongName": "relationPath",
"paramDescription": "the path where to store the selected subset of relations", "paramDescription": "the path where to store the selected subset of relations",
"paramRequired": false "paramRequired": false
},
{
"paramName": "pop",
"paramLongName": "projectOrganizationPath",
"paramDescription": "the number of iterations to be computed",
"paramRequired": true
} }
] ]

View File

@ -52,5 +52,10 @@
"paramLongName": "iterations", "paramLongName": "iterations",
"paramDescription": "the number of iterations to be computed", "paramDescription": "the number of iterations to be computed",
"paramRequired": false "paramRequired": false
} },{
"paramName": "pop",
"paramLongName": "projectOrganizationPath",
"paramDescription": "the number of iterations to be computed",
"paramRequired": true
}
] ]

View File

@ -134,7 +134,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>PrepareResultOrganizationAssociation</name> <name>PrepareResultOrganizationAssociation</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo</class> <class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar> <jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
@ -150,6 +150,7 @@
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg> <arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg> <arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg> <arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
<arg>--projectOrganizationPath</arg><arg>${workingDir}/preparedInfo/projectOrganizationPath</arg>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg> <arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
</spark> </spark>
<ok to="apply_resulttoorganization_propagation"/> <ok to="apply_resulttoorganization_propagation"/>
@ -161,7 +162,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>resultToOrganizationFromSemRel</name> <name>resultToOrganizationFromSemRel</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class> <class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar> <jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}

View File

@ -39,8 +39,10 @@ public class BulkTagJobTest {
+ " \"contributor\" : \"$['contributor'][*]['value']\"," + " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\", " + " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" , " + + " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" + "\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"," +
"} "; "\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
"\"hostedby\" : \"$['instance'][*]['hostedby']['key']\" , " +
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
private static SparkSession spark; private static SparkSession spark;
@ -56,7 +58,7 @@ public class BulkTagJobTest {
.toString( .toString(
BulkTagJobTest.class BulkTagJobTest.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_dth.xml")); "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_remove.xml"));
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -1525,4 +1527,45 @@ public class BulkTagJobTest {
.count()); .count());
} }
@Test
void removeTest() throws Exception {
final String pathMap = BulkTagJobTest.pathMap;
SparkBulkTagJob
.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints")
.getPath(),
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(12, tmp.count());
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
verificationDataset.createOrReplaceTempView("dataset");
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ "from dataset "
+ "lateral view explode(context) c as MyT "
+ "lateral view explode(MyT.datainfo) d as MyD "
+ "where MyD.inferenceprovenance = 'bulktagging'";
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'dth'").count());
}
} }

View File

@ -1,22 +1,17 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@ -28,7 +23,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.KeyValueSet; import eu.dnetlib.dhp.KeyValueSet;
import eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareInfoJobTest { public class PrepareInfoJobTest {
@ -78,11 +72,12 @@ public class PrepareInfoJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -223,11 +218,12 @@ public class PrepareInfoJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -343,11 +339,12 @@ public class PrepareInfoJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -355,7 +352,38 @@ public class PrepareInfoJobTest {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation/result")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
Assertions.assertEquals(7, verificationDs.count());
}
@Test
public void relationProjectTest() throws Exception {
PrepareInfo
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest")
.getPath(),
"-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation/project")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
@ -373,11 +401,12 @@ public class PrepareInfoJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -498,6 +527,141 @@ public class PrepareInfoJobTest {
} }
@Test
public void projectOrganizationTest1() throws Exception {
PrepareInfo
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest")
.getPath(),
"-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<KeyValueSet> tmp = sc
.textFile(workingDir.toString() + "/projectOrganization/")
.map(item -> OBJECT_MAPPER.readValue(item, KeyValueSet.class));
Dataset<KeyValueSet> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(KeyValueSet.class));
Assertions.assertEquals(5, verificationDs.count());
Assertions
.assertEquals(
2, verificationDs
.filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'")
.collectAsList()
.get(0)
.getValueSet()
.size());
Assertions
.assertTrue(
verificationDs
.filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
verificationDs
.filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertEquals(
2, verificationDs
.filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'")
.collectAsList()
.get(0)
.getValueSet()
.size());
Assertions
.assertTrue(
verificationDs
.filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"));
Assertions
.assertTrue(
verificationDs
.filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertEquals(
1, verificationDs
.filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'")
.collectAsList()
.get(0)
.getValueSet()
.size());
Assertions
.assertTrue(
verificationDs
.filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertEquals(
1, verificationDs
.filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'")
.collectAsList()
.get(0)
.getValueSet()
.size());
Assertions
.assertTrue(
verificationDs
.filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertEquals(
1, verificationDs
.filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'")
.collectAsList()
.get(0)
.getValueSet()
.size());
Assertions
.assertTrue(
verificationDs
.filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'")
.collectAsList()
.get(0)
.getValueSet()
.contains("20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"));
verificationDs
.foreach((ForeachFunction<KeyValueSet>) v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v)));
}
@Test @Test
public void foundLeavesTest1() throws Exception { public void foundLeavesTest1() throws Exception {
@ -507,11 +671,12 @@ public class PrepareInfoJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -534,11 +699,12 @@ public class PrepareInfoJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"

View File

@ -0,0 +1,900 @@
package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.KeyValueSet;
import eu.dnetlib.dhp.PropagationConstant;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class SparkJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(SparkJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(StepActionsTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PrepareInfoJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(PrepareInfoJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void completeResultExecution() throws Exception {
final String graphPath = getClass()
.getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph")
.getPath();
final String leavesPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath();
final String childParentPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath();
final String resultOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath();
final String projectOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/")
.getPath();
readPath(spark, leavesPath, Leaves.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/leavesInput");
readPath(spark, resultOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/orgsInput");
readPath(spark, projectOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/projectInput");
SparkResultToOrganizationFromSemRel
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-relationPath", graphPath,
"-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput",
"-projectOrganizationPath", workingDir.toString() + "/projectInput",
"-childParentPath", childParentPath,
"-workingDir", workingDir.toString()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> temp = sc
.textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(36, temp.count());
JavaRDD<Relation> result = temp.filter(r -> r.getSource().startsWith("50|") || r.getTarget().startsWith("50|"));
Assertions.assertEquals(18, result.count());
result.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType()));
result.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType()));
result
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance()));
result
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID,
r.getDataInfo().getProvenanceaction().getClassid()));
result
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME,
r.getDataInfo().getProvenanceaction().getClassname()));
result
.foreach(
r -> Assertions
.assertEquals(
"0.85",
r.getDataInfo().getTrust()));
Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("50|")).count());
result
.filter(r -> r.getSource().substring(0, 3).equals("50|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass()));
Assertions
.assertEquals(
2,
result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
3,
result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2,
result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
1,
result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
1,
result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
result
.filter(r -> r.getSource().substring(0, 3).equals("20|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass()));
Assertions
.assertEquals(
1,
result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
1,
result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2,
result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
2,
result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
3,
result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("50|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
result
.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
}
@Test
public void completeProjectExecution() throws Exception {
final String graphPath = getClass()
.getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph")
.getPath();
final String leavesPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath();
final String childParentPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath();
final String resultOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath();
final String projectOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/")
.getPath();
readPath(spark, leavesPath, Leaves.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/leavesInput");
readPath(spark, resultOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/orgsInput");
readPath(spark, projectOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/projectInput");
SparkResultToOrganizationFromSemRel
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-relationPath", graphPath,
"-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput",
"-projectOrganizationPath", workingDir.toString() + "/projectInput",
"-childParentPath", childParentPath,
"-workingDir", workingDir.toString()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> temp = sc
.textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(36, temp.count());
JavaRDD<Relation> project = temp
.filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|"));
Assertions.assertEquals(18, project.count());
project.foreach(r -> Assertions.assertEquals(ModelConstants.PARTICIPATION, r.getSubRelType()));
project.foreach(r -> Assertions.assertEquals(ModelConstants.PROJECT_ORGANIZATION, r.getRelType()));
project
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance()));
project
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID,
r.getDataInfo().getProvenanceaction().getClassid()));
project
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME,
r.getDataInfo().getProvenanceaction().getClassname()));
project
.foreach(
r -> Assertions
.assertEquals(
"0.85",
r.getDataInfo().getTrust()));
Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("40|")).count());
project
.filter(r -> r.getSource().substring(0, 3).equals("40|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass()));
Assertions
.assertEquals(
2,
project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
3,
project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2,
project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
1,
project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
1,
project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
project
.filter(r -> r.getSource().substring(0, 3).equals("20|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass()));
Assertions
.assertEquals(
1,
project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
1,
project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2,
project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
2,
project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
3,
project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("40|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("40|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
project
.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
}
@Test
public void singleIterationExecution() throws Exception {
final String graphPath = getClass()
.getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph")
.getPath();
final String leavesPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath();
final String childParentPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath();
final String resultOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath();
final String projectOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/")
.getPath();
readPath(spark, leavesPath, Leaves.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/leavesInput");
readPath(spark, resultOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/orgsInput");
readPath(spark, projectOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/projectInput");
SparkResultToOrganizationFromSemRel
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-relationPath", graphPath,
"-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput",
"-projectOrganizationPath", workingDir.toString() + "/projectInput",
"-childParentPath", childParentPath,
"-workingDir", workingDir.toString(),
"-iterations", "1"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> temp = sc
.textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(16, temp.count());
Assertions.assertEquals(4, temp.filter(r -> r.getSource().startsWith("50|")).count());
Assertions.assertEquals(4, temp.filter(r -> r.getTarget().startsWith("50|")).count());
Assertions.assertEquals(4, temp.filter(r -> r.getSource().startsWith("40|")).count());
Assertions.assertEquals(4, temp.filter(r -> r.getTarget().startsWith("40|")).count());
Assertions.assertEquals(8, temp.filter(r -> r.getSource().startsWith("20|")).count());
Assertions.assertEquals(8, temp.filter(r -> r.getSource().startsWith("20|")).count());
// JavaRDD<Relation> result = temp.filter(r -> r.getSource().startsWith("50|") || r.getTarget().startsWith("50|"));
// Assertions.assertEquals(18, result.count());
// result.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType()));
// result.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType()));
// result
// .foreach(
// r -> Assertions
// .assertEquals(
// PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance()));
// result
// .foreach(
// r -> Assertions
// .assertEquals(
// PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID,
// r.getDataInfo().getProvenanceaction().getClassid()));
// result
// .foreach(
// r -> Assertions
// .assertEquals(
// PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME,
// r.getDataInfo().getProvenanceaction().getClassname()));
// result
// .foreach(
// r -> Assertions
// .assertEquals(
// "0.85",
// r.getDataInfo().getTrust()));
//
// Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("50|")).count());
// result
// .filter(r -> r.getSource().substring(0, 3).equals("50|"))
// .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass()));
// Assertions
// .assertEquals(
// 2, result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
// Assertions
// .assertEquals(
// 3, result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
// Assertions
// .assertEquals(
// 2, result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
// Assertions
// .assertEquals(
// 1, result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
// Assertions
// .assertEquals(
// 1, result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
//
// Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
// result
// .filter(r -> r.getSource().substring(0, 3).equals("20|"))
// .foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass()));
// Assertions
// .assertEquals(
// 1, result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
// Assertions
// .assertEquals(
// 1, result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
// Assertions
// .assertEquals(
// 2, result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
// Assertions
// .assertEquals(
// 2, result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
// Assertions
// .assertEquals(
// 3, result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|openaire____::ec653e804967133b9436fdd30d3ff51d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|doajarticles::03748bcb5d754c951efec9700e18a56d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
//
// Assertions
// .assertTrue(
// result
// .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
// .map(r -> r.getTarget())
// .collect()
// .contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel; package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -73,21 +73,22 @@ public class StepActionsTest {
.execStep( .execStep(
spark, getClass() spark, getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/result")
.getPath(), .getPath(),
workingDir.toString() + "/newRelationPath", workingDir.toString() + "/newRelationPath",
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath()); .getPath(),
ModelConstants.HAS_AUTHOR_INSTITUTION);
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -203,19 +204,19 @@ public class StepActionsTest {
spark, spark,
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath(), .getPath(),
workingDir.toString() + "/tempLeaves", workingDir.toString() + "/tempOrgs"); workingDir.toString() + "/tempLeaves", workingDir.toString() + "/tempOrgs");
@ -248,19 +249,19 @@ public class StepActionsTest {
spark, spark,
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath(), .getPath(),
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath(), .getPath(),
workingDir.toString() + "/tempLeaves", workingDir.toString() + "/tempOrgs"); workingDir.toString() + "/tempLeaves", workingDir.toString() + "/tempOrgs");

View File

@ -1,325 +0,0 @@
package eu.dnetlib.dhp.resulttoorganizationfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.isSparkSessionManaged;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.KeyValueSet;
import eu.dnetlib.dhp.PropagationConstant;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class SparkJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(SparkJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(StepActionsTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PrepareInfoJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(PrepareInfoJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void completeExecution() throws Exception {
final String graphPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep")
.getPath();
final String leavesPath = getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/")
.getPath();
final String childParentPath = getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/")
.getPath();
final String resultOrgPath = getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/")
.getPath();
readPath(spark, leavesPath, Leaves.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/leavesInput");
readPath(spark, resultOrgPath, KeyValueSet.class)
.write()
.option("compression", "gzip")
.json(workingDir.toString() + "/orgsInput");
SparkResultToOrganizationFromSemRel
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-relationPath", graphPath,
"-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput",
"-childParentPath", childParentPath,
"-workingDir", workingDir.toString()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
Assertions.assertEquals(18, tmp.count());
tmp.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType()));
tmp.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType()));
tmp
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance()));
tmp
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID,
r.getDataInfo().getProvenanceaction().getClassid()));
tmp
.foreach(
r -> Assertions
.assertEquals(
PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME,
r.getDataInfo().getProvenanceaction().getClassname()));
tmp
.foreach(
r -> Assertions
.assertEquals(
"0.85",
r.getDataInfo().getTrust()));
Assertions.assertEquals(9, tmp.filter(r -> r.getSource().substring(0, 3).equals("50|")).count());
tmp
.filter(r -> r.getSource().substring(0, 3).equals("50|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass()));
Assertions
.assertEquals(
2, tmp.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
3, tmp.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2, tmp.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
1, tmp.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
1, tmp.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, tmp.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
tmp
.filter(r -> r.getSource().substring(0, 3).equals("20|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass()));
Assertions
.assertEquals(
1, tmp.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
1, tmp.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2, tmp.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
2, tmp.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
3, tmp.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget())
.collect()
.contains("50|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget())
.collect()
.contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions
.assertTrue(
tmp
.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget())
.collect()
.contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
}
}

View File

@ -0,0 +1,7 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d","validated":false}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d","validated":false}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d","validated":false}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0","validated":false}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","validated":false}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074","validated":false}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1","validated":false}

View File

@ -0,0 +1,5 @@
{"key":"40|openaire____::ec653e804967133b9436fdd30d3ff51d","valueSet":["20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"]}
{"key":"40|doajarticles::03748bcb5d754c951efec9700e18a56d","valueSet":["20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"]}
{"key":"40|doajarticles::1cae0b82b56ccd97c2db1f698def7074","valueSet":["20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"]}
{"key":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","valueSet":["20|pippo_wf_001::2899e571609779168222fdeb59cb916d","20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"]}
{"key":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","valueSet":["20|pippo_wf_001::2899e571609779168222fdeb59cb916d","20|dedup_wf_001::2899e571609779168222fdeb59cb916d"]}

View File

@ -0,0 +1,7 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"}

View File

@ -0,0 +1,7 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"}

View File

@ -1,14 +0,0 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"}

View File

@ -5,8 +5,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -17,7 +15,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.Constants;
@ -27,12 +24,13 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public abstract class AbstractMdRecordToOafMapper { public abstract class AbstractMdRecordToOafMapper {
protected final VocabularyGroup vocs; protected final VocabularyGroup vocs;
protected static final UrlValidator URL_VALIDATOR = UrlValidator.getInstance();
private final boolean invisible; private final boolean invisible;
private final boolean shouldHashId; private final boolean shouldHashId;
@ -393,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setPublisher(preparePublisher(doc, info)); r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info)); r.setSource(prepareSources(doc, info));
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); r.setFulltext(prepareListURL(doc, "//oaf:fulltext", info));
r.setFormat(prepareFormats(doc, info)); r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info)); r.setContributor(prepareContributors(doc, info));
r.setResourcetype(prepareResourceType(doc, info)); r.setResourcetype(prepareResourceType(doc, info));
@ -672,6 +670,14 @@ public abstract class AbstractMdRecordToOafMapper {
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
} }
protected List<Field<String>> prepareListURL(final Node node, final String xpath, final DataInfo info) {
return listFields(
info, prepareListString(node, xpath)
.stream()
.filter(URL_VALIDATOR::isValid)
.collect(Collectors.toList()));
}
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) { protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
return field(node.valueOf(xpath), info); return field(node.valueOf(xpath), info);
} }
@ -695,13 +701,13 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected Set<String> validateUrl(Collection<String> url) { protected Set<String> validateUrl(Collection<String> url) {
UrlValidator urlValidator = UrlValidator.getInstance();
if (Objects.isNull(url)) { if (Objects.isNull(url)) {
return new HashSet<>(); return new HashSet<>();
} }
return url return url
.stream() .stream()
.filter(u -> urlValidator.isValid(u)) .filter(URL_VALIDATOR::isValid)
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
} }

View File

@ -140,7 +140,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info); final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); final Set<StructuredProperty> pids = new HashSet<>(pid);
instance instance
.setAlternateIdentifier( .setAlternateIdentifier(
@ -158,6 +158,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
instance instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
prepareListURL(doc, "//oaf:fulltext", info)
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(instance::setFulltext);
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
final List<String> url = nodes final List<String> url = nodes
.stream() .stream()

View File

@ -144,7 +144,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info); final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); final Set<StructuredProperty> pids = new HashSet<>(pid);
instance instance
.setAlternateIdentifier( .setAlternateIdentifier(
@ -161,6 +161,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
prepareListURL(doc, "//oaf:fulltext", info)
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(instance::setFulltext);
final Set<String> url = new HashSet<>(); final Set<String> url = new HashSet<>();
for (final Object o : doc for (final Object o : doc

View File

@ -27,7 +27,6 @@ import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -52,7 +51,7 @@ class MappersTest {
} }
@Test @Test
void testPublication() throws IOException, DocumentException { void testPublication() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml"))); final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
@ -112,13 +111,17 @@ class MappersTest {
assertNotNull(i.getAccessright()); assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid()); assertEquals("OPEN", i.getAccessright().getClassid());
}); });
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); final Instance instance = p.getInstance().get(0);
assertNotNull(p.getInstance().get(0).getPid()); assertEquals("0001", instance.getRefereed().getClassid());
assertTrue(p.getInstance().get(0).getPid().isEmpty()); assertNotNull(instance.getPid());
assertTrue(instance.getPid().isEmpty());
assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty()); assertFalse(instance.getAlternateIdentifier().isEmpty());
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());
assertNotNull(instance.getFulltext());
assertEquals("https://oneecosystem.pensoft.net/article/13718/", instance.getFulltext());
assertNotNull(p.getBestaccessright()); assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid()); assertEquals("OPEN", p.getBestaccessright().getClassid());

View File

@ -807,7 +807,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[3.16.0]</dhp-schemas.version> <dhp-schemas.version>[3.17.1-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>