forked from D-Net/dnet-hadoop
dedup wf directory structure aligned with project commons
parent e16e644faf
commit 6cb0a9bff0
@@ -82,10 +82,6 @@
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-core</artifactId>
         </dependency>
-        <dependency>
-            <groupId>eu.dnetlib</groupId>
-            <artifactId>dnet-actionmanager-common</artifactId>
-        </dependency>
 
     </dependencies>
 
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import eu.dnetlib.dhp.schema.oaf.Field;
 import org.apache.commons.lang.StringUtils;
@@ -1,11 +1,9 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.google.common.collect.Lists;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
-import org.apache.commons.lang.NotImplementedException;
-import org.apache.commons.lang.StringUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -16,7 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper;
 import scala.Tuple2;
 
 import java.util.Collection;
-import java.util.Random;
 
 import static java.util.stream.Collectors.toMap;
 
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.google.common.collect.Sets;
 import com.wcohen.ss.JaroWinkler;
@@ -13,15 +13,8 @@ import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.model.Person;
 import org.apache.commons.codec.binary.Hex;
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
 import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.util.LongAccumulator;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
@@ -29,15 +22,11 @@ import org.dom4j.Element;
 import org.dom4j.io.SAXReader;
 import scala.Tuple2;
 
-import java.io.IOException;
 import java.io.StringReader;
-import java.io.StringWriter;
-import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 
 public class DedupUtility {
     private static final Double THRESHOLD = 0.95;
@@ -1,7 +1,6 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.BlockProcessor;
 import eu.dnetlib.pace.util.MapDocumentUtil;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 public enum OafEntityType {
 
@@ -1,8 +1,8 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.google.common.hash.Hashing;
-import eu.dnetlib.dedup.graph.ConnectedComponent;
-import eu.dnetlib.dedup.graph.GraphProcessor;
+import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
+import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -83,7 +83,7 @@ public class SparkCreateConnectedComponent {
     }
 
     public static long getHashcode(final String id) {
-        return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
+        return Hashing.murmur3_128().hashString(id).asLong();
     }
 
     private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
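Note: the getHashcode() change above (hashUnencodedChars to hashString) appears to track the Guava version pinned later in this commit (dhp.guava.version = 11.0.2): hashUnencodedChars(CharSequence) only exists in newer Guava releases, while Guava 11.x exposes hashString(CharSequence). A minimal sketch of the resulting call, using an invented identifier purely for illustration:

    import com.google.common.hash.Hashing;

    public class HashcodeSketch {
        // Mirrors the new getHashcode() shown above; Hashing.murmur3_128() and
        // hashString(CharSequence) are both available in Guava 11.x.
        public static long getHashcode(final String id) {
            return Hashing.murmur3_128().hashString(id).asLong();
        }

        public static void main(String[] args) {
            // "50|dedup_wf_001::example" is a made-up id, not taken from the codebase.
            System.out.println(getHashcode("50|dedup_wf_001::example"));
        }
    }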
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -73,7 +73,10 @@ public class SparkCreateSimRels implements Serializable {
         JavaRDD<Relation> relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity));
 
         //save the simrel in the workingdir
-        spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
+        spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class))
+                .write()
+                .mode("overwrite")
+                .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
 
         //create atomic actions
         JavaRDD<Tuple2<Text, Text>> newSimRels = relationsRDD
@@ -128,7 +131,6 @@ public class SparkCreateSimRels implements Serializable {
                 .appName(SparkCreateSimRels.class.getSimpleName())
                 .master(parser.get("master"))
                 .config(conf)
-                .enableHiveSupport()
                 .getOrCreate();
     }
 
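Note: dropping .enableHiveSupport() here goes together with the removal of the hive_metastore_uris and hive_db_name properties from config-default.xml further down in this commit; the session presumably only needs plain HDFS reads and writes. A minimal sketch (class name taken from the hunk above, everything else assumed) of the builder after the change:

    import org.apache.spark.SparkConf;
    import org.apache.spark.sql.SparkSession;

    public class SessionSketch {
        // Builds the session the way the modified code does: no Hive support enabled.
        static SparkSession getSparkSession(String master, SparkConf conf) {
            return SparkSession
                    .builder()
                    .appName("SparkCreateSimRels")
                    .master(master)
                    .config(conf)
                    .getOrCreate();
        }
    }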
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import eu.dnetlib.pace.util.Reporter;
 import org.apache.commons.logging.Log;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -1,7 +1,7 @@
-package eu.dnetlib.dedup.graph;
+package eu.dnetlib.dhp.dedup.graph;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dedup.DedupUtility;
+import eu.dnetlib.dhp.dedup.DedupUtility;
 import eu.dnetlib.pace.util.PaceException;
 import org.apache.commons.lang.StringUtils;
 import org.codehaus.jackson.annotate.JsonIgnore;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup.graph
+package eu.dnetlib.dhp.dedup.graph
 
 import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
@@ -1,126 +0,0 @@
-<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>sourcePath</name>
-            <description>the source path</description>
-        </property>
-        <property>
-            <name>entity</name>
-            <description>the entity that should be processed</description>
-        </property>
-        <property>
-            <name>dedupConf</name>
-            <description>the dedup Configuration</description>
-        </property>
-        <property>
-            <name>targetPath</name>
-            <description>the target path</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="CreateSimRels"/>
-
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <!-- <action name="DeleteTargetPath">-->
-    <!-- <fs>-->
-    <!-- <delete path='${targetPath}/${entity}_simrel'/>-->
-    <!-- <delete path='${targetPath}/${entity}_mergeRels'/>-->
-    <!-- </fs>-->
-    <!-- <ok to="CreateSimRels"/>-->
-    <!-- <error to="Kill"/>-->
-    <!-- </action>-->
-
-    <action name="CreateSimRels">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Similarity Relations</name>
-            <class>eu.dnetlib.dedup.SparkCreateSimRels</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory} --conf
-                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
-                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
-                spark.sql.warehouse.dir="/user/hive/warehouse"
-            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
-            <arg>--entity</arg><arg>${entity}</arg>
-            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
-        </spark>
-        <ok to="CreateConnectedComponents"/>
-        <error to="Kill"/>
-    </action>
-
-
-    <action name="CreateConnectedComponents">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Connected Components</name>
-            <class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory} --conf
-                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
-                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
-                spark.sql.warehouse.dir="/user/hive/warehouse"
-            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
-            <arg>--entity</arg><arg>${entity}</arg>
-            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
-        </spark>
-        <ok to="CreateDedupRecord"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="CreateDedupRecord">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Dedup Record</name>
-            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory} --conf
-                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
-                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
-                spark.sql.warehouse.dir="/user/hive/warehouse"
-            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--dedupPath</arg><arg>${dedupPath}</arg>
-            <arg>--entity</arg><arg>${entity}</arg>
-            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
@@ -15,12 +15,4 @@
         <name>oozie.action.sharelib.for.spark</name>
         <value>spark2</value>
     </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-    </property>
-    <property>
-        <name>hive_db_name</name>
-        <value>openaire</value>
-    </property>
 </configuration>
@@ -47,7 +47,7 @@
             <master>yarn-cluster</master>
             <mode>cluster</mode>
             <name>Update Relations</name>
-            <class>eu.dnetlib.dedup.SparkPropagateRelation</class>
+            <class>eu.dnetlib.dhp.dedup.SparkPropagateRelation</class>
             <jar>dhp-dedup-${projectVersion}.jar</jar>
             <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory} --conf
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
@@ -56,7 +56,7 @@
             <master>yarn-cluster</master>
             <mode>cluster</mode>
             <name>Create Merge Relations</name>
-            <class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
+            <class>eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent</class>
             <jar>dhp-dedup-${projectVersion}.jar</jar>
             <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory} --conf
@@ -81,7 +81,7 @@
             <master>yarn-cluster</master>
             <mode>cluster</mode>
             <name>Create Dedup Record</name>
-            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
+            <class>eu.dnetlib.dhp.dedup.SparkCreateDedupRecord</class>
             <jar>dhp-dedup-${projectVersion}.jar</jar>
             <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory} --conf
@@ -106,7 +106,7 @@
             <master>yarn-cluster</master>
             <mode>cluster</mode>
             <name>Create Dedup Record</name>
-            <class>eu.dnetlib.dedup.SparkUpdateEntity</class>
+            <class>eu.dnetlib.dhp.dedup.SparkUpdateEntity</class>
             <jar>dhp-dedup-${projectVersion}.jar</jar>
             <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory} --conf
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
@@ -34,6 +34,21 @@
         </property>
     </parameters>
 
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+        </configuration>
+    </global>
+
     <start to="DeleteWorkingPath"/>
 
     <kill name="Kill">
@@ -50,20 +65,19 @@
 
     <action name="DuplicateScan">
         <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
+            <master>yarn</master>
             <mode>cluster</mode>
             <name>Create Similarity Relations</name>
-            <class>eu.dnetlib.dedup.SparkCreateSimRels</class>
+            <class>eu.dnetlib.dhp.dedup.SparkCreateSimRels</class>
             <jar>dhp-dedup-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory} --conf
-                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
-                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
-                spark.sql.warehouse.dir="/user/hive/warehouse"
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
             </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>-mt</arg><arg>yarn</arg>
             <arg>--i</arg><arg>${graphBasePath}</arg>
             <arg>--o</arg><arg>${rawSet}</arg>
             <arg>--la</arg><arg>${isLookUpUrl}</arg>
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import org.apache.commons.io.IOUtils;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
 
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
@@ -74,19 +74,9 @@ public class SparkCreateDedupTest {
         final HashFunction hashFunction = Hashing.murmur3_128();
 
         System.out.println( s1.hashCode());
-        System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
+        System.out.println(hashFunction.hashString(s1).asLong());
         System.out.println( s2.hashCode());
-        System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
+        System.out.println(hashFunction.hashString(s2).asLong());
-    }
-
-    @Test
-    public void testJoinEntities() throws Exception{
-        SparkJoinEntities.main(new String[] {
-            "-mt", "local[*]",
-            "-i", "/tmp/dedup",
-            "-w", "/tmp/dedup",
-            "-o", "/tmp/dedup",
-        });
     }
 
 }
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup.jpath;
+package eu.dnetlib.dhp.dedup.jpath;
 
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -1,12 +1,14 @@
-sparkDriverMemory=10G
-sparkExecutorMemory=15G
+sparkExecutorCoresForJoining=1
+sparkDriverMemoryForJoining=10G
+sparkExecutorMemoryForJoining=15G
+sparkExecutorCoresForIndexing=64
+sparkDriverMemoryForIndexing=3G
+sparkExecutorMemoryForIndexing=2G
 #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
 isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
 sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
 outputPath=/tmp/openaire_provision
 format=TMF
 batchSize=2000
-sparkExecutorCoresForJoining=128
-sparkExecutorCoresForIndexing=64
 reuseRecords=false
 otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource
pom.xml
@@ -148,6 +148,13 @@
                 <version>${dhp.commons.lang.version}</version>
             </dependency>
 
+            <dependency>
+                <groupId>com.google.guava</groupId>
+                <artifactId>guava</artifactId>
+                <version>${dhp.guava.version}</version>
+            </dependency>
+
+
             <dependency>
                 <groupId>commons-codec</groupId>
                 <artifactId>commons-codec</artifactId>
@@ -345,22 +352,6 @@
                     </exclusion>
                 </exclusions>
             </dependency>
-
-            <dependency>
-                <groupId>eu.dnetlib</groupId>
-                <artifactId>dnet-actionmanager-common</artifactId>
-                <version>[6.0.0,7.0.0)</version>
-                <exclusions>
-                    <exclusion>
-                        <groupId>commons-httpclient</groupId>
-                        <artifactId>commons-httpclient</artifactId>
-                    </exclusion>
-                    <exclusion>
-                        <groupId>eu.dnetlib</groupId>
-                        <artifactId>dnet-openaireplus-mapping-utils</artifactId>
-                    </exclusion>
-                </exclusions>
-            </dependency>
         </dependencies>
     </dependencyManagement>
 
@@ -512,9 +503,9 @@
         <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
         <dhp.jackson.version>2.9.6</dhp.jackson.version>
         <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
+        <dhp.guava.version>11.0.2</dhp.guava.version>
         <scala.version>2.11.12</scala.version>
         <junit.version>4.12</junit.version>
         <mongodb.driver.version>3.4.2</mongodb.driver.version>
     </properties>
 </project>