dedup wf directory structure aligned with project commons

Claudio Atzori 2020-03-20 16:48:14 +01:00
parent e16e644faf
commit 6cb0a9bff0
30 changed files with 204 additions and 322 deletions

View File

@@ -82,10 +82,6 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
</dependency>
</dependencies>

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Field;
import org.apache.commons.lang.StringUtils;

View File

@@ -1,11 +1,9 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -16,7 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
import java.util.Collection;
import java.util.Random;
import static java.util.stream.Collectors.toMap;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
@@ -13,15 +13,8 @@ import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.DocumentException;
@@ -29,15 +22,11 @@ import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import scala.Tuple2;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DedupUtility {
private static final Double THRESHOLD = 0.95;

View File

@@ -1,7 +1,6 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
public enum OafEntityType {

View File

@@ -1,8 +1,8 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.google.common.hash.Hashing;
import eu.dnetlib.dedup.graph.ConnectedComponent;
import eu.dnetlib.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -83,7 +83,7 @@ public class SparkCreateConnectedComponent {
}
public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
return Hashing.murmur3_128().hashString(id).asLong();
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {

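The hunk above switches the murmur3_128 call from hashUnencodedChars to hashString, which lines up with the Guava 11.0.2 pin added in the parent pom later in this commit (hashUnencodedChars only appears in later Guava releases). A minimal sketch of the resulting id-hashing helper, assuming Guava 11.0.2 on the classpath; the sample identifier is hypothetical, not taken from the module:

```java
import com.google.common.hash.Hashing;

public class MurmurIdHashSketch {

    // Mirrors getHashcode(...) above: murmur3_128 over the entity id, truncated to a long.
    public static long getHashcode(final String id) {
        // In Guava 11.0.2, hashString(CharSequence) appears to hash each char as two
        // little-endian bytes -- the behaviour later renamed hashUnencodedChars.
        return Hashing.murmur3_128().hashString(id).asLong();
    }

    public static void main(String[] args) {
        // Hypothetical OpenAIRE-style identifier, used only to exercise the method.
        final String id = "50|doi_________::example";
        System.out.println(getHashcode(id));
    }
}
```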
View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -73,7 +73,10 @@ public class SparkCreateSimRels implements Serializable {
JavaRDD<Relation> relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity));
//save the simrel in the workingdir
spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class))
.write()
.mode("overwrite")
.save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
//create atomic actions
JavaRDD<Tuple2<Text, Text>> newSimRels = relationsRDD
@@ -128,7 +131,6 @@
.appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}

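The two hunks above reformat the simrel write and drop enableHiveSupport() from the session builder. A self-contained sketch of the resulting pattern, assuming the dhp-schemas Relation bean; the app name, master and output path are illustrative and not taken from the module:

```java
import java.util.Collections;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.schema.oaf.Relation;

public class SimRelWriteSketch {

    public static void main(String[] args) {
        // Session builder without enableHiveSupport(), as in the hunk above;
        // the real job takes the master from its -mt argument instead of hard-coding it.
        SparkSession spark = SparkSession
                .builder()
                .appName("SparkCreateSimRels-sketch")
                .master("local[*]")
                .getOrCreate();

        // Stand-in for the similarity relations computed by the deduplication step.
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        JavaRDD<Relation> relationsRDD = sc.parallelize(Collections.<Relation>emptyList());

        // Same write pattern as the reformatted lines above; the real path comes from
        // DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity).
        spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class))
                .write()
                .mode("overwrite")
                .save("/tmp/dedup/organization_simrel"); // illustrative output path

        spark.stop();
    }
}
```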
View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

View File

@@ -1,7 +1,7 @@
package eu.dnetlib.dedup.graph;
package eu.dnetlib.dhp.dedup.graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dedup.DedupUtility;
import eu.dnetlib.dhp.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup.graph
package eu.dnetlib.dhp.dedup.graph
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

View File

@@ -1,126 +0,0 @@
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>entity</name>
<description>the entity that should be processed</description>
</property>
<property>
<name>dedupConf</name>
<description>the dedup Configuration</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="CreateSimRels"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- <action name="DeleteTargetPath">-->
<!-- <fs>-->
<!-- <delete path='${targetPath}/${entity}_simrel'/>-->
<!-- <delete path='${targetPath}/${entity}_mergeRels'/>-->
<!-- </fs>-->
<!-- <ok to="CreateSimRels"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="CreateSimRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateConnectedComponents"/>
<error to="Kill"/>
</action>
<action name="CreateConnectedComponents">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Connected Components</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--dedupPath</arg><arg>${dedupPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -15,12 +15,4 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
</property>
</configuration>

View File

@@ -47,7 +47,7 @@
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Update Relations</name>
<class>eu.dnetlib.dedup.SparkPropagateRelation</class>
<class>eu.dnetlib.dhp.dedup.SparkPropagateRelation</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf

View File

@@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@@ -56,7 +56,7 @@
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Merge Relations</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<class>eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
@@ -81,7 +81,7 @@
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<class>eu.dnetlib.dhp.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
@@ -106,7 +106,7 @@
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkUpdateEntity</class>
<class>eu.dnetlib.dhp.dedup.SparkUpdateEntity</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf

View File

@@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@@ -34,6 +34,21 @@
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="DeleteWorkingPath"/>
<kill name="Kill">
@@ -50,20 +65,19 @@
<action name="DuplicateScan">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<class>eu.dnetlib.dhp.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--o</arg><arg>${rawSet}</arg>
<arg>--la</arg><arg>${isLookUpUrl}</arg>

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.IOUtils;

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
@@ -74,19 +74,9 @@ public class SparkCreateDedupTest {
final HashFunction hashFunction = Hashing.murmur3_128();
System.out.println( s1.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
System.out.println(hashFunction.hashString(s1).asLong());
System.out.println( s2.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
}
@Test
public void testJoinEntities() throws Exception{
SparkJoinEntities.main(new String[] {
"-mt", "local[*]",
"-i", "/tmp/dedup",
"-w", "/tmp/dedup",
"-o", "/tmp/dedup",
});
System.out.println(hashFunction.hashString(s2).asLong());
}
}

View File

@@ -1,4 +1,4 @@
package eu.dnetlib.dedup.jpath;
package eu.dnetlib.dhp.dedup.jpath;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

View File

@@ -1,12 +1,14 @@
sparkDriverMemory=10G
sparkExecutorMemory=15G
sparkExecutorCoresForJoining=1
sparkDriverMemoryForJoining=10G
sparkExecutorMemoryForJoining=15G
sparkExecutorCoresForIndexing=64
sparkDriverMemoryForIndexing=3G
sparkExecutorMemoryForIndexing=2G
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
outputPath=/tmp/openaire_provision
format=TMF
batchSize=2000
sparkExecutorCoresForJoining=128
sparkExecutorCoresForIndexing=64
reuseRecords=false
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource

pom.xml
View File

@@ -148,6 +148,13 @@
<version>${dhp.commons.lang.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${dhp.guava.version}</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
@@ -345,22 +352,6 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<version>[6.0.0,7.0.0)</version>
<exclusions>
<exclusion>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
</exclusion>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</dependencyManagement>
@@ -512,9 +503,9 @@
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version>
<scala.version>2.11.12</scala.version>
<junit.version>4.12</junit.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
</properties>
</project>