From 12621b1c45993ea8fa988cb092188307a8a73ca4 Mon Sep 17 00:00:00 2001 From: miconis Date: Sun, 12 Jul 2020 10:13:54 +0200 Subject: [PATCH] implementation of a class to test the clustering functions --- dnet-dedup-test/pom.xml.releaseBackup | 149 ++++++++ .../java/eu/dnetlib/ClusteringTester.java | 115 ++++++ .../java/eu/dnetlib/pace/DedupLocalTest.java | 4 + .../config/organization.current.conf.json | 8 +- dnet-pace-core/pom.xml.releaseBackup | 73 ++++ pom.xml.releaseBackup | 333 ++++++++++++++++++ release.properties | 22 ++ 7 files changed, 700 insertions(+), 4 deletions(-) create mode 100644 dnet-dedup-test/pom.xml.releaseBackup create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java create mode 100644 dnet-pace-core/pom.xml.releaseBackup create mode 100644 pom.xml.releaseBackup create mode 100644 release.properties diff --git a/dnet-dedup-test/pom.xml.releaseBackup b/dnet-dedup-test/pom.xml.releaseBackup new file mode 100644 index 0000000..1e017bd --- /dev/null +++ b/dnet-dedup-test/pom.xml.releaseBackup @@ -0,0 +1,149 @@ + + + + 4.0.0 + + + eu.dnetlib + dnet-dedup + 4.0.2-SNAPSHOT + ../pom.xml + + + dnet-dedup-test + jar + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + org.apache.maven.plugins + maven-deploy-plugin + 2.7 + + true + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + **/*.java + + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + + + + + + + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + + + + + eu.dnetlib + dnet-pace-core + ${project.version} + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-graphx_2.11 + + + + org.apache.spark + spark-sql_2.11 + + + + junit + junit + test + + + + com.fasterxml.jackson.core + jackson-databind + + + + org.scala-lang + scala-library + + + + + \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java b/dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java new file mode 100644 index 0000000..b701363 --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java @@ -0,0 +1,115 @@ +package eu.dnetlib; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.Block; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.DoubleFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +public class ClusteringTester { + + public static void main(String[] args) throws Exception { + + String configPath = args[0]; + String entitiesPath = args[1]; + + new ClusteringTester() + .run(configPath, entitiesPath); + } + + public void run(String configPath, String entitiesPath) throws IOException { + + DedupConfig dedupConf = DedupConfig.load(readJson(configPath)); + + SparkSession spark = SparkSession + .builder() + .appName("ClusteringTester") + .master("local[*]") + .getOrCreate(); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaPairRDD mapDocuments = sc + .textFile(entitiesPath) + .mapToPair( + (PairFunction) s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + long totalRecords = mapDocuments.count(); + + // create blocks for deduplication + JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + + //block_key, cardinality, comparisons + JavaRDD>> blockStats = blocks.map(b -> new Tuple2<>(b._1(), new Tuple2<>((b._2().elements()), comparisonsNumber(b._2(), dedupConf)))); + + Long totalComparisons = blockStats.map(b -> b._2()._2()).reduce((a, b) -> a + b); + + Long blocksNumber = blockStats.count(); + + JavaDoubleRDD blockSizeRDD = blockStats.mapToDouble(b -> Double.parseDouble(b._2()._1().toString())); + + Double maxBlockSize = blockSizeRDD.max(); + + double[] buckets = new double[(int) (maxBlockSize/10 + 3)]; + + double bucketSize = 10.0; + + double bucketBase = 0.0; + for (int i=0; i < buckets.length; i++) { + buckets[i] = bucketBase; + bucketBase += bucketSize; + } + + long[] histogram = blockSizeRDD.histogram(buckets); + + System.out.println("b | n"); + for (int i=0; i< histogram.length; i++) { + System.out.println(buckets[i] + " | " + histogram[i]); + } + + System.out.println("max block size = " + maxBlockSize); + System.out.println("number of records = " + totalRecords); + System.out.println("number of blocks = " + blocksNumber); + System.out.println("total number of comparisons = " + totalComparisons); + } + + //compute the number of comparisons considering the sliding window + public static Long comparisonsNumber(Block b, DedupConfig dedupConfig){ + long blockSize = b.elements(); + long slidingWindowSize = dedupConfig.getWf().getSlidingWindowSize(); + if (slidingWindowSize >= blockSize) + return ((slidingWindowSize*(slidingWindowSize-1))/2); + return (blockSize-slidingWindowSize+1)*((slidingWindowSize*(slidingWindowSize-1))/2); + } + + public String readJson(String fileName) throws IOException { + BufferedReader reader = new BufferedReader(new FileReader(fileName)); + StringBuilder stringBuilder = new StringBuilder(); + char[] buffer = new char[10]; + while (reader.read(buffer) != -1) { + stringBuilder.append(new String(buffer)); + buffer = new char[10]; + } + reader.close(); + + return stringBuilder.toString(); + } + +} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 125d375..2e08a69 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace; +import eu.dnetlib.ClusteringTester; import eu.dnetlib.Deduper; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.Field; @@ -8,16 +9,19 @@ import eu.dnetlib.pace.tree.support.TreeProcessor; import eu.dnetlib.pace.tree.support.TreeStats; import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.Block; import eu.dnetlib.support.ConnectedComponent; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.SparkSession; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; import scala.Tuple2; +import java.io.IOException; import java.net.URL; import java.util.List; import java.util.Map; diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json index 31b200c..8e3b90e 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json @@ -143,10 +143,10 @@ } }, "model" : [ - { "name" : "country", "type" : "String", "path" : "$.country.classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, - { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, + { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], diff --git a/dnet-pace-core/pom.xml.releaseBackup b/dnet-pace-core/pom.xml.releaseBackup new file mode 100644 index 0000000..0d18645 --- /dev/null +++ b/dnet-pace-core/pom.xml.releaseBackup @@ -0,0 +1,73 @@ + + + + 4.0.0 + + + eu.dnetlib + dnet-dedup + 4.0.2-SNAPSHOT + ../pom.xml + + + dnet-pace-core + jar + + + + edu.cmu + secondstring + + + com.google.guava + guava + + + com.google.code.gson + gson + + + org.apache.commons + commons-lang3 + + + commons-io + commons-io + + + + org.antlr + stringtemplate + + + commons-logging + commons-logging + + + junit + junit + test + + + org.reflections + reflections + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.commons + commons-math3 + + + + com.jayway.jsonpath + json-path + + + + + + + diff --git a/pom.xml.releaseBackup b/pom.xml.releaseBackup new file mode 100644 index 0000000..d840af1 --- /dev/null +++ b/pom.xml.releaseBackup @@ -0,0 +1,333 @@ + + + + 4.0.0 + + eu.dnetlib + dnet-dedup + 4.0.2-SNAPSHOT + + pom + + http://www.d-net.research-infrastructures.eu + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + + + scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git + HEAD + + + + dnet-pace-core + dnet-dedup-test + + + + Redmine + https://issue.openaire.research-infrastructures.eu/projects/openaire + + + + + + + dnet45-releases + D-Net 45 Releases + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + default + + + + + + dnet-deps + dnet-dependencies + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps + default + + + dnet45-releases + D-Net 45 Releases + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + default + + true + + + + dnet45-snapshots + D-Net 45 Snapshots + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots + default + + true + + + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.0 + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + true + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.0 + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.13 + + + integration-test + + integration-test + + + + verify + + verify + + + + + + + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + + + + + + + edu.cmu + secondstring + 1.0.0 + + + org.antlr + stringtemplate + 3.2 + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + + + + + + org.apache.commons + commons-math3 + 3.6.1 + + + + com.google.guava + guava + ${google.guava.version} + + + com.google.code.gson + gson + ${google.gson.version} + + + + org.apache.commons + commons-lang3 + ${commons.lang.version} + + + + commons-io + commons-io + ${commons.io.version} + + + commons-collections + commons-collections + ${commons.collections.version} + + + commons-logging + commons-logging + ${commons.logging.version} + + + org.apache.spark + spark-core_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-graphx_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${spark.version} + provided + + + junit + junit + ${junit.version} + test + + + org.reflections + reflections + 0.9.10 + + + + org.scala-lang + scala-library + ${scala.version} + + + + org.apache.oozie + oozie-client + 5.1.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + + + + + + + UTF-8 + UTF-8 + + 2.2.2 + 15.0 + + 2.2.0 + 2.6.6 + + 3.5 + 2.4 + 3.2.1 + 1.1.3 + + 4.9 + 2.11.8 + + false + + diff --git a/release.properties b/release.properties new file mode 100644 index 0000000..70a3c40 --- /dev/null +++ b/release.properties @@ -0,0 +1,22 @@ +#release configuration +#Thu Jul 02 17:06:39 CEST 2020 +scm.commentPrefix=[maven-release-plugin] +pushChanges=true +project.rel.eu.dnetlib\:dnet-dedup-test=4.0.2 +scm.tag=dnet-dedup-4.0.2 +remoteTagging=true +project.scm.eu.dnetlib\:dnet-dedup-test.empty=true +projectVersionPolicyId=default +scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git +scm.tagNameFormat=@{project.artifactId}-@{project.version} +project.rel.eu.dnetlib\:dnet-dedup=4.0.2 +project.dev.eu.dnetlib\:dnet-pace-core=4.0.3-SNAPSHOT +preparationGoals=clean verify +project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD +project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git +exec.snapshotReleasePluginAllowed=false +project.dev.eu.dnetlib\:dnet-dedup=4.0.3-SNAPSHOT +project.scm.eu.dnetlib\:dnet-pace-core.empty=true +project.dev.eu.dnetlib\:dnet-dedup-test=4.0.3-SNAPSHOT +completedPhase=end-release +project.rel.eu.dnetlib\:dnet-pace-core=4.0.2