forked from D-Net/dnet-hadoop
code refactoring (useless classes and workflows removed), implementation of the test for the openorgs dedup
parent 2355cc4e9b
commit f446580e9f
@@ -94,6 +94,12 @@
 		<groupId>org.apache.httpcomponents</groupId>
 		<artifactId>httpclient</artifactId>
 	</dependency>
+	<dependency>
+		<groupId>com.h2database</groupId>
+		<artifactId>h2</artifactId>
+		<version>1.4.200</version>
+		<scope>test</scope>
+	</dependency>
 </dependencies>
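Not part of the commit, but a minimal sketch of what the test-scoped H2 dependency above enables: the new SparkOpenorgsDedupTest (added further down in this diff) replaces the PostgreSQL server used by the production workflow with an in-memory H2 database. Assuming the h2 artifact is on the test classpath, plain JDBC against the same URL declared in that test looks roughly like this; the class name H2ConnectionSketch is made up for illustration.

// Illustrative only (not in the commit): connect to the in-memory H2 URL used by the test.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;

public class H2ConnectionSketch {
	public static void main(String[] args) throws Exception {
		// URL, user and password copied from SparkOpenorgsDedupTest below; the database
		// lives only in memory and survives across connections (DB_CLOSE_DELAY=-1).
		String dbUrl = "jdbc:h2:mem:openorgs_test;DB_CLOSE_DELAY=-1;DATABASE_TO_UPPER=false";
		try (Connection conn = DriverManager.getConnection(dbUrl, "sa", "")) {
			ResultSet rs = conn.prepareStatement("SELECT 1").executeQuery();
			rs.next();
			System.out.println("H2 reachable, got: " + rs.getInt(1)); // prints 1
		}
	}
}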
@@ -1,184 +0,0 @@

package eu.dnetlib.dhp.oa.dedup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.dom4j.DocumentException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import scala.Tuple2;

public class SparkCollectSimRels extends AbstractSparkAction {

	private static final Logger log = LoggerFactory.getLogger(SparkCollectSimRels.class);

	Dataset<Row> simGroupsDS;
	Dataset<Row> groupsDS;

	public SparkCollectSimRels(ArgumentApplicationParser parser, SparkSession spark, Dataset<Row> simGroupsDS,
		Dataset<Row> groupsDS) {
		super(parser, spark);
		this.simGroupsDS = simGroupsDS;
		this.groupsDS = groupsDS;
	}

	public static void main(String[] args) throws Exception {
		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkBlockStats.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
		parser.parseArgument(args);

		SparkConf conf = new SparkConf();

		final String dbUrl = parser.get("postgresUrl");
		final String dbUser = parser.get("postgresUser");
		final String dbPassword = parser.get("postgresPassword");

		SparkSession spark = getSparkSession(conf);

		DataFrameReader readOptions = spark
			.read()
			.format("jdbc")
			.option("url", dbUrl)
			.option("user", dbUser)
			.option("password", dbPassword);

		new SparkCollectSimRels(
			parser,
			spark,
			readOptions.option("dbtable", "similarity_groups").load(),
			readOptions.option("dbtable", "groups").load())
				.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
	}

	@Override
	void run(ISLookUpService isLookUpService) throws DocumentException, ISLookUpException, IOException {

		// read oozie parameters
		final String isLookUpUrl = parser.get("isLookUpUrl");
		final String actionSetId = parser.get("actionSetId");
		final String workingPath = parser.get("workingPath");
		final int numPartitions = Optional
			.ofNullable(parser.get("numPartitions"))
			.map(Integer::valueOf)
			.orElse(NUM_PARTITIONS);
		final String dbUrl = parser.get("postgresUrl");
		final String dbUser = parser.get("postgresUser");

		log.info("numPartitions: '{}'", numPartitions);
		log.info("isLookUpUrl: '{}'", isLookUpUrl);
		log.info("actionSetId: '{}'", actionSetId);
		log.info("workingPath: '{}'", workingPath);
		log.info("postgresUser: {}", dbUser);
		log.info("postgresUrl: {}", dbUrl);
		log.info("postgresPassword: xxx");

		JavaPairRDD<String, List<String>> similarityGroup = simGroupsDS
			.toJavaRDD()
			.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)))
			.groupByKey()
			.mapToPair(
				i -> new Tuple2<>(i._1(), StreamSupport
					.stream(i._2().spliterator(), false)
					.collect(Collectors.toList())));

		JavaPairRDD<String, String> groupIds = groupsDS
			.toJavaRDD()
			.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)));

		JavaRDD<Tuple2<Tuple2<String, String>, List<String>>> groups = similarityGroup
			.leftOuterJoin(groupIds)
			.filter(g -> g._2()._2().isPresent())
			.map(g -> new Tuple2<>(new Tuple2<>(g._1(), g._2()._2().get()), g._2()._1()));

		JavaRDD<Relation> relations = groups.flatMap(g -> {
			String firstId = g._2().get(0);
			List<Relation> rels = new ArrayList<>();

			for (String id : g._2()) {
				if (!firstId.equals(id))
					rels.add(createSimRel(firstId, id, g._1()._2()));
			}

			return rels.iterator();
		});

		Dataset<Relation> resultRelations = spark
			.createDataset(
				relations.filter(r -> r.getRelType().equals("resultResult")).rdd(),
				Encoders.bean(Relation.class))
			.repartition(numPartitions);

		Dataset<Relation> organizationRelations = spark
			.createDataset(
				relations.filter(r -> r.getRelType().equals("organizationOrganization")).rdd(),
				Encoders.bean(Relation.class))
			.repartition(numPartitions);

		for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
			switch (dedupConf.getWf().getSubEntityValue()) {
				case "organization":
					savePostgresRelation(organizationRelations, workingPath, actionSetId, "organization");
					break;
				default:
					savePostgresRelation(
						resultRelations, workingPath, actionSetId, dedupConf.getWf().getSubEntityValue());
					break;
			}
		}

	}

	private Relation createSimRel(String source, String target, String entity) {
		final Relation r = new Relation();
		r.setSubRelType("dedupSimilarity");
		r.setRelClass("isSimilarTo");
		r.setDataInfo(new DataInfo());

		switch (entity) {
			case "result":
				r.setSource("50|" + source);
				r.setTarget("50|" + target);
				r.setRelType("resultResult");
				break;
			case "organization":
				r.setSource("20|" + source);
				r.setTarget("20|" + target);
				r.setRelType("organizationOrganization");
				break;
			default:
				throw new IllegalArgumentException("unmanaged entity type: " + entity);
		}
		return r;
	}

	private void savePostgresRelation(Dataset<Relation> newRelations, String workingPath, String actionSetId,
		String entityType) {
		newRelations
			.write()
			.mode(SaveMode.Append)
			.parquet(DedupUtility.createSimRelPath(workingPath, actionSetId, entityType));
	}

}
@@ -9,6 +9,7 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.graphx.Edge;
 import org.apache.spark.rdd.RDD;
@@ -1,44 +0,0 @@
[
	{
		"paramName": "la",
		"paramLongName": "isLookUpUrl",
		"paramDescription": "address for the LookUp",
		"paramRequired": true
	},
	{
		"paramName": "asi",
		"paramLongName": "actionSetId",
		"paramDescription": "action set identifier (name of the orchestrator)",
		"paramRequired": true
	},
	{
		"paramName": "w",
		"paramLongName": "workingPath",
		"paramDescription": "path of the working directory",
		"paramRequired": true
	},
	{
		"paramName": "np",
		"paramLongName": "numPartitions",
		"paramDescription": "number of partitions for the similarity relations intermediate phases",
		"paramRequired": false
	},
	{
		"paramName": "purl",
		"paramLongName": "postgresUrl",
		"paramDescription": "the url of the postgres server",
		"paramRequired": true
	},
	{
		"paramName": "pusr",
		"paramLongName": "postgresUser",
		"paramDescription": "the owner of the postgres database",
		"paramRequired": true
	},
	{
		"paramName": "ppwd",
		"paramLongName": "postgresPassword",
		"paramDescription": "the password for the postgres user",
		"paramRequired": true
	}
]
@@ -1,18 +0,0 @@
<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarnRM</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://nameservice1</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
</configuration>
@@ -1,208 +0,0 @@
<workflow-app name="Organization Dedup" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>graphBasePath</name>
			<description>the raw graph base path</description>
		</property>
		<property>
			<name>isLookUpUrl</name>
			<description>the address of the lookUp service</description>
		</property>
		<property>
			<name>actionSetId</name>
			<description>id of the actionSet</description>
		</property>
		<property>
			<name>workingPath</name>
			<description>path for the working directory</description>
		</property>
		<property>
			<name>dedupGraphPath</name>
			<description>path for the output graph</description>
		</property>
		<property>
			<name>cutConnectedComponent</name>
			<description>max number of elements in a connected component</description>
		</property>
		<property>
			<name>dbUrl</name>
			<description>the url of the database</description>
		</property>
		<property>
			<name>dbUser</name>
			<description>the user of the database</description>
		</property>
		<property>
			<name>dbTable</name>
			<description>the name of the table in the database</description>
		</property>
		<property>
			<name>dbPwd</name>
			<description>the passowrd of the user of the database</description>
		</property>
		<property>
			<name>sparkDriverMemory</name>
			<description>memory for driver process</description>
		</property>
		<property>
			<name>sparkExecutorMemory</name>
			<description>memory for individual executor</description>
		</property>
		<property>
			<name>sparkExecutorCores</name>
			<description>number of cores used by single executor</description>
		</property>
		<property>
			<name>oozieActionShareLibForSpark2</name>
			<description>oozie action sharelib for spark 2.*</description>
		</property>
		<property>
			<name>spark2ExtraListeners</name>
			<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
			<description>spark 2.* extra listeners classname</description>
		</property>
		<property>
			<name>spark2SqlQueryExecutionListeners</name>
			<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
			<description>spark 2.* sql query execution listeners classname</description>
		</property>
		<property>
			<name>spark2YarnHistoryServerAddress</name>
			<description>spark 2.* yarn history server address</description>
		</property>
		<property>
			<name>spark2EventLogDir</name>
			<description>spark 2.* event log dir location</description>
		</property>
	</parameters>

	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapreduce.job.queuename</name>
				<value>${queueName}</value>
			</property>
			<property>
				<name>oozie.launcher.mapred.job.queue.name</name>
				<value>${oozieLauncherQueueName}</value>
			</property>
			<property>
				<name>oozie.action.sharelib.for.spark</name>
				<value>${oozieActionShareLibForSpark2}</value>
			</property>
		</configuration>
	</global>

	<start to="resetWorkingPath"/>

	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="resetWorkingPath">
		<fs>
			<delete path="${workingPath}"/>
		</fs>
		<ok to="copyRelations"/>
		<error to="Kill"/>
	</action>

	<action name="copyRelations">
		<distcp xmlns="uri:oozie:distcp-action:0.2">
			<arg>-pb</arg>
			<arg>${graphBasePath}/relation</arg>
			<arg>${workingPath}/${actionSetId}/organization_simrel</arg>
		</distcp>
		<ok to="CreateSimRel"/>
		<error to="Kill"/>
	</action>

	<action name="CreateSimRel">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Create Similarity Relations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--numPartitions</arg><arg>8000</arg>
		</spark>
		<ok to="CreateMergeRel"/>
		<error to="Kill"/>
	</action>

	<action name="CreateMergeRel">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Create Merge Relations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
		</spark>
		<ok to="PrepareNewOrgs"/>
		<error to="Kill"/>
	</action>

	<action name="PrepareNewOrgs">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Prepare New Organizations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkPrepareNewOrgs</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--dbUrl</arg><arg>${dbUrl}</arg>
			<arg>--dbTable</arg><arg>${dbTable}</arg>
			<arg>--dbUser</arg><arg>${dbUser}</arg>
			<arg>--dbPwd</arg><arg>${dbPwd}</arg>
			<arg>--numConnections</arg><arg>20</arg>
		</spark>
		<ok to="End"/>
		<error to="Kill"/>
	</action>

	<end name="End"/>
</workflow-app>
@@ -1,18 +0,0 @@
<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarnRM</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://nameservice1</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
</configuration>
@@ -1,240 +0,0 @@
<workflow-app name="Organization Dedup" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>graphBasePath</name>
			<description>the raw graph base path</description>
		</property>
		<property>
			<name>isLookUpUrl</name>
			<description>the address of the lookUp service</description>
		</property>
		<property>
			<name>actionSetId</name>
			<description>id of the actionSet</description>
		</property>
		<property>
			<name>workingPath</name>
			<description>path for the working directory</description>
		</property>
		<property>
			<name>dedupGraphPath</name>
			<description>path for the output graph</description>
		</property>
		<property>
			<name>cutConnectedComponent</name>
			<description>max number of elements in a connected component</description>
		</property>
		<property>
			<name>dbUrl</name>
			<description>the url of the database</description>
		</property>
		<property>
			<name>dbUser</name>
			<description>the user of the database</description>
		</property>
		<property>
			<name>dbTable</name>
			<description>the name of the table in the database</description>
		</property>
		<property>
			<name>dbPwd</name>
			<description>the passowrd of the user of the database</description>
		</property>
		<property>
			<name>sparkDriverMemory</name>
			<description>memory for driver process</description>
		</property>
		<property>
			<name>sparkExecutorMemory</name>
			<description>memory for individual executor</description>
		</property>
		<property>
			<name>sparkExecutorCores</name>
			<description>number of cores used by single executor</description>
		</property>
		<property>
			<name>oozieActionShareLibForSpark2</name>
			<description>oozie action sharelib for spark 2.*</description>
		</property>
		<property>
			<name>spark2ExtraListeners</name>
			<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
			<description>spark 2.* extra listeners classname</description>
		</property>
		<property>
			<name>spark2SqlQueryExecutionListeners</name>
			<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
			<description>spark 2.* sql query execution listeners classname</description>
		</property>
		<property>
			<name>spark2YarnHistoryServerAddress</name>
			<description>spark 2.* yarn history server address</description>
		</property>
		<property>
			<name>spark2EventLogDir</name>
			<description>spark 2.* event log dir location</description>
		</property>
	</parameters>

	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapreduce.job.queuename</name>
				<value>${queueName}</value>
			</property>
			<property>
				<name>oozie.launcher.mapred.job.queue.name</name>
				<value>${oozieLauncherQueueName}</value>
			</property>
			<property>
				<name>oozie.action.sharelib.for.spark</name>
				<value>${oozieActionShareLibForSpark2}</value>
			</property>
		</configuration>
	</global>

	<start to="resetWorkingPath"/>

	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="resetWorkingPath">
		<fs>
			<delete path="${workingPath}"/>
		</fs>
		<ok to="copyRelations"/>
		<error to="Kill"/>
	</action>

	<action name="copyRelations">
		<distcp xmlns="uri:oozie:distcp-action:0.2">
			<arg>-pb</arg>
			<arg>/tmp/graph_openorgs_and_corda/relation</arg>
			<arg>${workingPath}/${actionSetId}/organization_simrel</arg>
		</distcp>
		<ok to="CreateSimRel"/>
		<error to="Kill"/>
	</action>

	<action name="CreateSimRel">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Create Similarity Relations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--numPartitions</arg><arg>8000</arg>
		</spark>
		<ok to="CreateMergeRel"/>
		<error to="Kill"/>
	</action>

	<action name="CreateMergeRel">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Create Merge Relations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
		</spark>
		<ok to="PrepareOrgRels"/>
		<error to="Kill"/>
	</action>

	<action name="PrepareOrgRels">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Prepare Organization Relations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkPrepareOrgRels</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--dbUrl</arg><arg>${dbUrl}</arg>
			<arg>--dbTable</arg><arg>${dbTable}</arg>
			<arg>--dbUser</arg><arg>${dbUser}</arg>
			<arg>--dbPwd</arg><arg>${dbPwd}</arg>
			<arg>--numConnections</arg><arg>20</arg>
		</spark>
		<ok to="PrepareNewOrgs"/>
		<error to="Kill"/>
	</action>

	<action name="PrepareNewOrgs">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>Prepare New Organizations</name>
			<class>eu.dnetlib.dhp.oa.dedup.SparkPrepareNewOrgs</class>
			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
			<arg>--apiUrl</arg><arg>${apiUrl}</arg>
			<arg>--dbUrl</arg><arg>${dbUrl}</arg>
			<arg>--dbTable</arg><arg>${dbTable}</arg>
			<arg>--dbUser</arg><arg>${dbUser}</arg>
			<arg>--dbPwd</arg><arg>${dbPwd}</arg>
			<arg>--numConnections</arg><arg>20</arg>
		</spark>
		<ok to="End"/>
		<error to="Kill"/>
	</action>

	<end name="End"/>
</workflow-app>
@@ -204,76 +204,8 @@ public class SparkDedupTest implements Serializable {
 		assertEquals(6750, orp_simrel);
 	}
 
-	@Disabled
 	@Test
 	@Order(2)
-	public void collectSimRelsTest() throws Exception {
-		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkCollectSimRels.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
-		parser
-			.parseArgument(
-				new String[] {
-					"-asi", testActionSetId,
-					"-la", "lookupurl",
-					"-w", testOutputBasePath,
-					"-np", "50",
-					"-purl", "jdbc:postgresql://localhost:5432/dnet_dedup",
-					"-pusr", "postgres_user",
-					"-ppwd", ""
-				});
-
-		new SparkCollectSimRels(
-			parser,
-			spark,
-			spark.read().load(testDedupAssertionsBasePath + "/similarity_groups"),
-			spark.read().load(testDedupAssertionsBasePath + "/groups"))
-				.run(isLookUpService);
-
-		long orgs_simrel = spark
-			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
-			.count();
-
-		long pubs_simrel = spark
-			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
-			.count();
-
-		long sw_simrel = spark
-			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
-			.count();
-
-		long ds_simrel = spark
-			.read()
-			.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
-			.count();
-
-		long orp_simrel = spark
-			.read()
-			.json(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
-			.count();
-
-		// System.out.println("orgs_simrel = " + orgs_simrel);
-		// System.out.println("pubs_simrel = " + pubs_simrel);
-		// System.out.println("sw_simrel = " + sw_simrel);
-		// System.out.println("ds_simrel = " + ds_simrel);
-		// System.out.println("orp_simrel = " + orp_simrel);
-
-		assertEquals(3672, orgs_simrel);
-		assertEquals(10459, pubs_simrel);
-		assertEquals(3767, sw_simrel);
-		assertEquals(3865, ds_simrel);
-		assertEquals(10173, orp_simrel);
-
-	}
-
-	@Test
-	@Order(3)
 	public void cutMergeRelsTest() throws Exception {
 
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -369,7 +301,7 @@ public class SparkDedupTest implements Serializable {
 	}
 
 	@Test
-	@Order(4)
+	@Order(3)
 	public void createMergeRelsTest() throws Exception {
 
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -424,7 +356,7 @@ public class SparkDedupTest implements Serializable {
 	}
 
 	@Test
-	@Order(5)
+	@Order(4)
 	public void createDedupRecordTest() throws Exception {
 
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -471,7 +403,7 @@ public class SparkDedupTest implements Serializable {
 	}
 
 	@Test
-	@Order(6)
+	@Order(5)
 	public void updateEntityTest() throws Exception {
 
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -587,7 +519,7 @@ public class SparkDedupTest implements Serializable {
 	}
 
 	@Test
-	@Order(7)
+	@Order(6)
 	public void propagateRelationTest() throws Exception {
 
 		ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -637,7 +569,7 @@ public class SparkDedupTest implements Serializable {
 	}
 
 	@Test
-	@Order(8)
+	@Order(7)
 	public void testRelations() throws Exception {
 		testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
 		testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
@@ -0,0 +1,408 @@

package eu.dnetlib.dhp.oa.dedup;

import static java.nio.file.Files.createTempDirectory;

import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.CollectionsUtils;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.platform.commons.util.StringUtils;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;

@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkOpenorgsDedupTest implements Serializable {

	private static String dbUrl = "jdbc:h2:mem:openorgs_test;DB_CLOSE_DELAY=-1;DATABASE_TO_UPPER=false";
	private static String dbUser = "sa";
	private static String dbTable = "tmp_dedup_events";
	private static String dbPwd = "";

	@Mock(serializable = true)
	ISLookUpService isLookUpService;

	protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

	private static SparkSession spark;
	private static JavaSparkContext jsc;

	private static String testGraphBasePath;
	private static String testOutputBasePath;
	private static String testDedupGraphBasePath;
	private static final String testActionSetId = "test-orchestrator-openorgs";

	@BeforeAll
	public static void cleanUp() throws IOException, URISyntaxException {

		testGraphBasePath = Paths
			.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/openorgs_dedup").toURI())
			.toFile()
			.getAbsolutePath();
		testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
			.toAbsolutePath()
			.toString();
		testDedupGraphBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
			.toAbsolutePath()
			.toString();

		FileUtils.deleteDirectory(new File(testOutputBasePath));
		FileUtils.deleteDirectory(new File(testDedupGraphBasePath));

		final SparkConf conf = new SparkConf();
		conf.set("spark.sql.shuffle.partitions", "200");
		spark = SparkSession
			.builder()
			.appName(SparkDedupTest.class.getSimpleName())
			.master("local[*]")
			.config(conf)
			.getOrCreate();

		jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());

	}

	@BeforeEach
	public void setUp() throws IOException, ISLookUpException {

		lenient()
			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
			.thenReturn(
				IOUtils
					.toString(
						SparkDedupTest.class
							.getResourceAsStream(
								"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_openorgs.xml")));

		lenient()
			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization")))
			.thenReturn(
				IOUtils
					.toString(
						SparkDedupTest.class
							.getResourceAsStream(
								"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
	}

	@Test
	@Order(1)
	public void createSimRelsTest() throws Exception {

		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkCreateSimRels.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));

		parser
			.parseArgument(
				new String[] {
					"-i", testGraphBasePath,
					"-asi", testActionSetId,
					"-la", "lookupurl",
					"-w", testOutputBasePath,
					"-np", "50"
				});

		new SparkCreateSimRels(parser, spark).run(isLookUpService);

		long orgs_simrel = spark
			.read()
			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
			.count();

		assertEquals(288, orgs_simrel);
	}

	@Test
	@Order(2)
	public void copyOpenorgsSimRels() throws Exception {
		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkCopyOpenorgsSimRels.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/dedup/copyOpenorgsMergeRels_parameters.json")));
		parser
			.parseArgument(
				new String[] {
					"-i", testGraphBasePath,
					"-asi", testActionSetId,
					"-w", testOutputBasePath,
					"-la", "lookupurl",
					"-np", "50"
				});

		new SparkCopyOpenorgsSimRels(parser, spark).run(isLookUpService);

		long orgs_simrel = spark
			.read()
			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
			.count();

		assertEquals(324, orgs_simrel);
	}

	@Test
	@Order(3)
	public void createMergeRelsTest() throws Exception {

		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkCreateMergeRels.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));

		parser
			.parseArgument(
				new String[] {
					"-i",
					testGraphBasePath,
					"-asi",
					testActionSetId,
					"-la",
					"lookupurl",
					"-w",
					testOutputBasePath
				});

		new SparkCreateMergeRels(parser, spark).run(isLookUpService);

		long orgs_mergerel = spark
			.read()
			.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
			.count();
		assertEquals(132, orgs_mergerel);

		// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
		// do)
		List<String> diffRels = jsc
			.textFile(DedupUtility.createEntityPath(testGraphBasePath, "relation"))
			.map(s -> OBJECT_MAPPER.readValue(s, Relation.class))
			.filter(r -> r.getRelClass().equals("isDifferentFrom"))
			.map(r -> r.getTarget())
			.collect();
		assertEquals(18, diffRels.size());

		List<String> mergeRels = spark
			.read()
			.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
			.as(Encoders.bean(Relation.class))
			.toJavaRDD()
			.map(r -> r.getTarget())
			.collect();
		assertFalse(Collections.disjoint(mergeRels, diffRels));

	}

	@Test
	@Order(4)
	public void prepareOrgRelsTest() throws Exception {
		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkCreateSimRels.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/dedup/prepareOrgRels_parameters.json")));
		parser
			.parseArgument(
				new String[] {
					"-i",
					testGraphBasePath,
					"-asi",
					testActionSetId,
					"-la",
					"lookupurl",
					"-w",
					testOutputBasePath,
					"-du",
					dbUrl,
					"-dusr",
					dbUser,
					"-t",
					dbTable,
					"-dpwd",
					dbPwd
				});

		new SparkPrepareOrgRels(parser, spark).run(isLookUpService);

		final Properties connectionProperties = new Properties();
		connectionProperties.put("user", dbUser);
		connectionProperties.put("password", dbPwd);

		Connection connection = DriverManager.getConnection(dbUrl, connectionProperties);

		ResultSet resultSet = connection
			.prepareStatement("SELECT COUNT(*) as total_rels FROM " + dbTable)
			.executeQuery();
		if (resultSet.next()) {
			int total_rels = resultSet.getInt("total_rels");
			assertEquals(32, total_rels);
		} else
			fail("No result in the sql DB");
		resultSet.close();

		// verify the number of organizations with duplicates
		ResultSet resultSet2 = connection
			.prepareStatement("SELECT COUNT(DISTINCT(local_id)) as total_orgs FROM " + dbTable)
			.executeQuery();
		if (resultSet2.next()) {
			int total_orgs = resultSet2.getInt("total_orgs");
			assertEquals(6, total_orgs);
		} else
			fail("No result in the sql DB");
		resultSet2.close();

		// verify that no DiffRel is in the DB
		List<String> diffRels = jsc
			.textFile(DedupUtility.createEntityPath(testGraphBasePath, "relation"))
			.map(s -> OBJECT_MAPPER.readValue(s, Relation.class))
			.filter(r -> r.getRelClass().equals("isDifferentFrom"))
			.map(r -> r.getSource() + "@@@" + r.getTarget())
			.collect();

		List<String> dbRels = new ArrayList<>();
		ResultSet resultSet3 = connection
			.prepareStatement("SELECT local_id, oa_original_id FROM " + dbTable)
			.executeQuery();
		while (resultSet3.next()) {
			String source = OafMapperUtils.createOpenaireId("organization", resultSet3.getString("local_id"), true);
			String target = OafMapperUtils
				.createOpenaireId("organization", resultSet3.getString("oa_original_id"), true);
			dbRels.add(source + "@@@" + target);
		}
		resultSet3.close();
		assertTrue(Collections.disjoint(dbRels, diffRels));

		connection.close();
	}

	@Test
	@Order(5)
	public void prepareNewOrgsTest() throws Exception {
		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkCreateSimRels.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/dedup/prepareNewOrgs_parameters.json")));
		parser
			.parseArgument(
				new String[] {
					"-i",
					testGraphBasePath,
					"-asi",
					testActionSetId,
					"-la",
					"lookupurl",
					"-w",
					testOutputBasePath,
					"-du",
					dbUrl,
					"-dusr",
					dbUser,
					"-t",
					dbTable,
					"-dpwd",
					dbPwd
				});

		new SparkPrepareNewOrgs(parser, spark).run(isLookUpService);

		final Properties connectionProperties = new Properties();
		connectionProperties.put("user", dbUser);
		connectionProperties.put("password", dbPwd);

		long orgs_in_diffrel = jsc
			.textFile(DedupUtility.createEntityPath(testGraphBasePath, "relation"))
			.map(s -> OBJECT_MAPPER.readValue(s, Relation.class))
			.filter(r -> r.getRelClass().equals("isDifferentFrom"))
			.map(r -> r.getTarget())
			.distinct()
			.count();

		Connection connection = DriverManager.getConnection(dbUrl, connectionProperties);

		jsc
			.textFile(DedupUtility.createEntityPath(testGraphBasePath, "relation"))
			.map(s -> OBJECT_MAPPER.readValue(s, Relation.class))
			.filter(r -> r.getRelClass().equals("isDifferentFrom"))
			.map(r -> r.getTarget())
			.distinct()
			.foreach(s -> System.out.println("difforgs = " + s));
		ResultSet resultSet0 = connection
			.prepareStatement("SELECT oa_original_id FROM " + dbTable + " WHERE local_id = ''")
			.executeQuery();
		while (resultSet0.next())
			System.out
				.println(
					"dborgs = " + OafMapperUtils.createOpenaireId(20, resultSet0.getString("oa_original_id"), true));
		resultSet0.close();

		ResultSet resultSet = connection
			.prepareStatement("SELECT COUNT(*) as total_new_orgs FROM " + dbTable + " WHERE local_id = ''")
			.executeQuery();
		if (resultSet.next()) {
			int total_new_orgs = resultSet.getInt("total_new_orgs");
			assertEquals(orgs_in_diffrel + 1, total_new_orgs);
		} else
			fail("No result in the sql DB");
		resultSet.close();
	}

	@AfterAll
	public static void finalCleanUp() throws IOException {
		FileUtils.deleteDirectory(new File(testOutputBasePath));
		FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
	}

}
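A note on running the new test (an assumption about the build, not something stated in the commit): since the class relies only on a local[*] SparkSession, classpath resources and the in-memory H2 database configured above, it should be runnable in isolation from the dhp-dedup-openaire module with the usual Maven Surefire filter, e.g. mvn test -Dtest=SparkOpenorgsDedupTest, without any external Postgres or Hadoop services.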
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -40,4 +40,4 @@ SELECT
 	0.99 AS trust,
 	'' AS inferenceprovenance,
 	'isDifferentFrom' AS relclass
-FROM oa_duplicates WHERE reltype = 'is_different'
+FROM oa_duplicates WHERE reltype = 'is_different';