adjusted parameters for the dedup stats workflow
This commit is contained in:
parent
03ecfa5ebd
commit
66f9f6d323
|
@ -1,8 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import eu.dnetlib.dhp.oa.dedup.model.BlockStats;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
|
@ -15,8 +15,10 @@ import org.apache.spark.sql.SparkSession;
|
|||
import org.dom4j.DocumentException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.BlockStats;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -45,12 +47,6 @@ public class SparkBlockStats extends AbstractSparkAction {
|
|||
parser.parseArgument(args);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
conf
|
||||
.registerKryoClasses(
|
||||
new Class[] {
|
||||
MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class
|
||||
});
|
||||
|
||||
new SparkCreateSimRels(parser, getSparkSession(conf))
|
||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||
|
@ -93,13 +89,13 @@ public class SparkBlockStats extends AbstractSparkAction {
|
|||
// create blocks for deduplication
|
||||
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
|
||||
|
||||
JavaRDD<BlockStats> blockStats = blocks.map(b ->
|
||||
new BlockStats(
|
||||
JavaRDD<BlockStats> blockStats = blocks
|
||||
.map(
|
||||
b -> new BlockStats(
|
||||
b._1(),
|
||||
(long) b._2().getDocuments().size(),
|
||||
computeComparisons(
|
||||
(long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize()))
|
||||
);
|
||||
(long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())));
|
||||
|
||||
// save the blockstats in the workingdir
|
||||
spark
|
||||
|
@ -110,7 +106,7 @@ public class SparkBlockStats extends AbstractSparkAction {
|
|||
}
|
||||
}
|
||||
|
||||
public Long computeComparisons(Long blockSize, Long slidingWindowSize){
|
||||
public Long computeComparisons(Long blockSize, Long slidingWindowSize) {
|
||||
|
||||
if (slidingWindowSize >= blockSize)
|
||||
return (slidingWindowSize * (slidingWindowSize - 1)) / 2;
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class BlockStats implements Serializable {
|
||||
|
||||
private String key; //key of the block
|
||||
private Long size; //number of elements in the block
|
||||
private Long comparisons; //number of comparisons in the block
|
||||
private String key; // key of the block
|
||||
private Long size; // number of elements in the block
|
||||
private Long comparisons; // number of comparisons in the block
|
||||
|
||||
public BlockStats() {
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Duplicate Scan" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Create dedup blocks" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphBasePath</name>
|
||||
|
@ -12,14 +12,6 @@
|
|||
<name>actionSetId</name>
|
||||
<description>id of the actionSet</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>path for the working directory</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>dedupGraphPath</name>
|
||||
<description>path for the output graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -85,7 +77,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Similarity Relations</name>
|
||||
<name>Create deduplication blocks</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkBlockStats</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -101,7 +93,7 @@
|
|||
<arg>--i</arg><arg>${graphBasePath}</arg>
|
||||
<arg>--la</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--asi</arg><arg>${actionSetId}</arg>
|
||||
<arg>--w</arg><arg>${workingPath}</arg>
|
||||
<arg>--w</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -1,8 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -14,15 +23,9 @@ import org.mockito.Mock;
|
|||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class SparkStatsTest implements Serializable {
|
||||
|
@ -172,4 +175,3 @@ public class SparkStatsTest implements Serializable {
|
|||
assertEquals(55, orp_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue