dedup wf directory structure aligned with project commons

This commit is contained in:
Claudio Atzori 2020-03-20 16:48:14 +01:00
parent e16e644faf
commit 6cb0a9bff0
30 changed files with 204 additions and 322 deletions

View File

@ -82,10 +82,6 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId> <artifactId>jackson-core</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
</dependency>
</dependencies> </dependencies>

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;

View File

@ -1,11 +1,9 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@ -16,7 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2; import scala.Tuple2;
import java.util.Collection; import java.util.Collection;
import java.util.Random;
import static java.util.stream.Collectors.toMap; import static java.util.stream.Collectors.toMap;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
@ -13,15 +13,8 @@ import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext; import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -29,15 +22,11 @@ import org.dom4j.Element;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import scala.Tuple2; import scala.Tuple2;
import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest; import java.security.MessageDigest;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DedupUtility { public class DedupUtility {
private static final Double THRESHOLD = 0.95; private static final Double THRESHOLD = 0.95;

View File

@ -1,7 +1,6 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
public enum OafEntityType { public enum OafEntityType {

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import eu.dnetlib.dedup.graph.ConnectedComponent; import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
import eu.dnetlib.dedup.graph.GraphProcessor; import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -83,7 +83,7 @@ public class SparkCreateConnectedComponent {
} }
public static long getHashcode(final String id) { public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); return Hashing.murmur3_128().hashString(id).asLong();
} }
private static SparkSession getSparkSession(ArgumentApplicationParser parser) { private static SparkSession getSparkSession(ArgumentApplicationParser parser) {

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -73,7 +73,10 @@ public class SparkCreateSimRels implements Serializable {
JavaRDD<Relation> relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); JavaRDD<Relation> relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity));
//save the simrel in the workingdir //save the simrel in the workingdir
spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class))
.write()
.mode("overwrite")
.save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
//create atomic actions //create atomic actions
JavaRDD<Tuple2<Text, Text>> newSimRels = relationsRDD JavaRDD<Tuple2<Text, Text>> newSimRels = relationsRDD
@ -128,7 +131,6 @@ public class SparkCreateSimRels implements Serializable {
.appName(SparkCreateSimRels.class.getSimpleName()) .appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master")) .master(parser.get("master"))
.config(conf) .config(conf)
.enableHiveSupport()
.getOrCreate(); .getOrCreate();
} }

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.util.Reporter; import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dedup.graph; package eu.dnetlib.dhp.dedup.graph;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dedup.DedupUtility; import eu.dnetlib.dhp.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore; import org.codehaus.jackson.annotate.JsonIgnore;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup.graph package eu.dnetlib.dhp.dedup.graph
import org.apache.spark.graphx._ import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD

View File

@ -1,126 +0,0 @@
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>entity</name>
<description>the entity that should be processed</description>
</property>
<property>
<name>dedupConf</name>
<description>the dedup Configuration</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="CreateSimRels"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- <action name="DeleteTargetPath">-->
<!-- <fs>-->
<!-- <delete path='${targetPath}/${entity}_simrel'/>-->
<!-- <delete path='${targetPath}/${entity}_mergeRels'/>-->
<!-- </fs>-->
<!-- <ok to="CreateSimRels"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="CreateSimRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateConnectedComponents"/>
<error to="Kill"/>
</action>
<action name="CreateConnectedComponents">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Connected Components</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--dedupPath</arg><arg>${dedupPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -15,12 +15,4 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
</property>
</configuration> </configuration>

View File

@ -47,7 +47,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Update Relations</name> <name>Update Relations</name>
<class>eu.dnetlib.dedup.SparkPropagateRelation</class> <class>eu.dnetlib.dhp.dedup.SparkPropagateRelation</class>
<jar>dhp-dedup-${projectVersion}.jar</jar> <jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf --driver-memory=${sparkDriverMemory} --conf

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -56,7 +56,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Merge Relations</name> <name>Create Merge Relations</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class> <class>eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-${projectVersion}.jar</jar> <jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf --driver-memory=${sparkDriverMemory} --conf
@ -81,7 +81,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Dedup Record</name> <name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class> <class>eu.dnetlib.dhp.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-${projectVersion}.jar</jar> <jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf --driver-memory=${sparkDriverMemory} --conf
@ -106,7 +106,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Dedup Record</name> <name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkUpdateEntity</class> <class>eu.dnetlib.dhp.dedup.SparkUpdateEntity</class>
<jar>dhp-dedup-${projectVersion}.jar</jar> <jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf --driver-memory=${sparkDriverMemory} --conf

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -34,6 +34,21 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="DeleteWorkingPath"/> <start to="DeleteWorkingPath"/>
<kill name="Kill"> <kill name="Kill">
@ -50,20 +65,19 @@
<action name="DuplicateScan"> <action name="DuplicateScan">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <master>yarn</master>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Similarity Relations</name> <name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class> <class>eu.dnetlib.dhp.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-${projectVersion}.jar</jar> <jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} <spark-opts>
--driver-memory=${sparkDriverMemory} --conf --executor-memory ${sparkExecutorMemory}
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf --executor-cores ${sparkExecutorCores}
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf --driver-memory=${sparkDriverMemory}
spark.sql.warehouse.dir="/user/hive/warehouse" --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
</spark-opts> </spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg> <arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg> <arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--o</arg><arg>${rawSet}</arg> <arg>--o</arg><arg>${rawSet}</arg>
<arg>--la</arg><arg>${isLookUpUrl}</arg> <arg>--la</arg><arg>${isLookUpUrl}</arg>

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.dedup;
import com.google.common.hash.HashFunction; import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
@ -74,19 +74,9 @@ public class SparkCreateDedupTest {
final HashFunction hashFunction = Hashing.murmur3_128(); final HashFunction hashFunction = Hashing.murmur3_128();
System.out.println( s1.hashCode()); System.out.println( s1.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); System.out.println(hashFunction.hashString(s1).asLong());
System.out.println( s2.hashCode()); System.out.println( s2.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); System.out.println(hashFunction.hashString(s2).asLong());
}
@Test
public void testJoinEntities() throws Exception{
SparkJoinEntities.main(new String[] {
"-mt", "local[*]",
"-i", "/tmp/dedup",
"-w", "/tmp/dedup",
"-o", "/tmp/dedup",
});
} }
} }

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dedup.jpath; package eu.dnetlib.dhp.dedup.jpath;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;

View File

@ -1,12 +1,14 @@
sparkDriverMemory=10G sparkExecutorCoresForJoining=1
sparkExecutorMemory=15G sparkDriverMemoryForJoining=10G
sparkExecutorMemoryForJoining=15G
sparkExecutorCoresForIndexing=64
sparkDriverMemoryForIndexing=3G
sparkExecutorMemoryForIndexing=2G
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
outputPath=/tmp/openaire_provision outputPath=/tmp/openaire_provision
format=TMF format=TMF
batchSize=2000 batchSize=2000
sparkExecutorCoresForJoining=128
sparkExecutorCoresForIndexing=64
reuseRecords=false reuseRecords=false
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource

225
pom.xml
View File

@ -1,6 +1,6 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" <project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
@ -101,12 +101,12 @@
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
<version>${dhp.hadoop.version}</version> <version>${dhp.hadoop.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId> <artifactId>hadoop-client</artifactId>
<version>${dhp.hadoop.version}</version> <version>${dhp.hadoop.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
@ -148,6 +148,13 @@
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${dhp.guava.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-codec</groupId> <groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId> <artifactId>commons-codec</artifactId>
@ -167,11 +174,11 @@
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.sf.saxon</groupId> <groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId> <artifactId>Saxon-HE</artifactId>
<version>9.9.1-6</version> <version>9.9.1-6</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
@ -192,56 +199,56 @@
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.mycila.xmltool</groupId> <groupId>com.mycila.xmltool</groupId>
<artifactId>xmltool</artifactId> <artifactId>xmltool</artifactId>
<version>3.3</version> <version>3.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.solr</groupId> <groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId> <artifactId>solr-solrj</artifactId>
<version>7.5.0</version> <version>7.5.0</version>
<exclusions> <exclusions>
<exclusion> <exclusion>
<artifactId>*</artifactId> <artifactId>*</artifactId>
<groupId>*</groupId> <groupId>*</groupId>
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.lucidworks.spark</groupId> <groupId>com.lucidworks.spark</groupId>
<artifactId>spark-solr</artifactId> <artifactId>spark-solr</artifactId>
<version>3.6.0</version> <version>3.6.0</version>
<exclusions> <exclusions>
<exclusion> <exclusion>
<artifactId>*</artifactId> <artifactId>*</artifactId>
<groupId>*</groupId> <groupId>*</groupId>
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>4.5.3</version> <version>4.5.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId> <artifactId>httpmime</artifactId>
<version>4.5.3</version> <version>4.5.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.noggit</groupId> <groupId>org.noggit</groupId>
<artifactId>noggit</artifactId> <artifactId>noggit</artifactId>
<version>0.8</version> <version>0.8</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.zookeeper</groupId> <groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId> <artifactId>zookeeper</artifactId>
<version>3.4.11</version> <version>3.4.11</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.schmizz</groupId> <groupId>net.schmizz</groupId>
<artifactId>sshj</artifactId> <artifactId>sshj</artifactId>
<version>0.10.0</version> <version>0.10.0</version>
@ -283,17 +290,17 @@
<artifactId>dnet-pace-core</artifactId> <artifactId>dnet-pace-core</artifactId>
<version>4.0.0</version> <version>4.0.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId> <artifactId>cnr-rmi-api</artifactId>
<version>[2.0.0,3.0.0)</version> <version>[2.0.0,3.0.0)</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.cxf</groupId> <groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-transports-http</artifactId> <artifactId>cxf-rt-transports-http</artifactId>
<version>3.1.5</version> <version>3.1.5</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>javax.persistence</groupId> <groupId>javax.persistence</groupId>
<artifactId>javax.persistence-api</artifactId> <artifactId>javax.persistence-api</artifactId>
@ -301,36 +308,36 @@
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.rabbitmq</groupId> <groupId>com.rabbitmq</groupId>
<artifactId>amqp-client</artifactId> <artifactId>amqp-client</artifactId>
<version>5.6.0</version> <version>5.6.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
<version>2.4.0</version> <version>2.4.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.arakelian</groupId> <groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId> <artifactId>java-jq</artifactId>
<version>0.10.1</version> <version>0.10.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>
<version>1.0.0</version> <version>1.0.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.mongodb</groupId> <groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId> <artifactId>mongo-java-driver</artifactId>
<version>${mongodb.driver.version}</version> <version>${mongodb.driver.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.antlr</groupId> <groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId> <artifactId>stringtemplate</artifactId>
<version>4.0</version> <version>4.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.oozie</groupId> <groupId>org.apache.oozie</groupId>
@ -345,22 +352,6 @@
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<version>[6.0.0,7.0.0)</version>
<exclusions>
<exclusion>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
</exclusion>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>
@ -512,9 +503,9 @@
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version> <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version> <dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version> <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version>
<scala.version>2.11.12</scala.version> <scala.version>2.11.12</scala.version>
<junit.version>4.12</junit.version> <junit.version>4.12</junit.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
</properties> </properties>
</project> </project>