adjusted parameters for the dedup stats workflow

This commit is contained in:
Claudio Atzori 2020-07-13 19:26:46 +02:00
parent 03ecfa5ebd
commit 66f9f6d323
4 changed files with 237 additions and 246 deletions

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import java.io.IOException; import java.io.IOException;
import eu.dnetlib.dhp.oa.dedup.model.BlockStats;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
@ -15,8 +15,10 @@ import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.oa.dedup.model.Block;
import eu.dnetlib.dhp.oa.dedup.model.BlockStats;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -45,12 +47,6 @@ public class SparkBlockStats extends AbstractSparkAction {
parser.parseArgument(args); parser.parseArgument(args);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf
.registerKryoClasses(
new Class[] {
MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class
});
new SparkCreateSimRels(parser, getSparkSession(conf)) new SparkCreateSimRels(parser, getSparkSession(conf))
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
@ -93,13 +89,13 @@ public class SparkBlockStats extends AbstractSparkAction {
// create blocks for deduplication // create blocks for deduplication
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
JavaRDD<BlockStats> blockStats = blocks.map(b -> JavaRDD<BlockStats> blockStats = blocks
new BlockStats( .map(
b -> new BlockStats(
b._1(), b._1(),
(long) b._2().getDocuments().size(), (long) b._2().getDocuments().size(),
computeComparisons( computeComparisons(
(long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())) (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())));
);
// save the blockstats in the workingdir // save the blockstats in the workingdir
spark spark

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.dedup.model; package eu.dnetlib.dhp.oa.dedup.model;
import java.io.Serializable; import java.io.Serializable;

View File

@ -1,4 +1,4 @@
<workflow-app name="Duplicate Scan" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Create dedup blocks" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>graphBasePath</name> <name>graphBasePath</name>
@ -12,14 +12,6 @@
<name>actionSetId</name> <name>actionSetId</name>
<description>id of the actionSet</description> <description>id of the actionSet</description>
</property> </property>
<property>
<name>workingPath</name>
<description>path for the working directory</description>
</property>
<property>
<name>dedupGraphPath</name>
<description>path for the output graph</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -85,7 +77,7 @@
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Similarity Relations</name> <name>Create deduplication blocks</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkBlockStats</class> <class>eu.dnetlib.dhp.oa.dedup.SparkBlockStats</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
@ -101,7 +93,7 @@
<arg>--i</arg><arg>${graphBasePath}</arg> <arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--la</arg><arg>${isLookUpUrl}</arg> <arg>--la</arg><arg>${isLookUpUrl}</arg>
<arg>--asi</arg><arg>${actionSetId}</arg> <arg>--asi</arg><arg>${actionSetId}</arg>
<arg>--w</arg><arg>${workingPath}</arg> <arg>--w</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -1,8 +1,17 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static java.nio.file.Files.createTempDirectory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -14,15 +23,9 @@ import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import java.io.File; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.io.IOException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import java.io.Serializable; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import static java.nio.file.Files.createTempDirectory;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
public class SparkStatsTest implements Serializable { public class SparkStatsTest implements Serializable {
@ -172,4 +175,3 @@ public class SparkStatsTest implements Serializable {
assertEquals(55, orp_blocks); assertEquals(55, orp_blocks);
} }
} }