From f738c2b6410527b8c29cf63bae85c7a77c11a697 Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 3 Apr 2019 09:40:14 +0200 Subject: [PATCH] addition of a sparktester test, implementation of 2 different classes for testing in dnet-dedup-test module, addition of new terms in the vocabulary and change in the implementation of the JaroWinklerNormalizedName comparator --- .DS_Store | Bin 6148 -> 8196 bytes dependencies.txt | 252 +++++++++++ dnet-dedup-test/.DS_Store | Bin 6148 -> 6148 bytes dnet-dedup-test/pom.xml | 62 ++- .../src/main/java/eu/dnetlib/.DS_Store | Bin 0 -> 6148 bytes .../main/java/eu/dnetlib/SparkLocalTest.java | 128 ++++++ .../src/main/java/eu/dnetlib/SparkTest.java | 47 +- .../src/main/java/eu/dnetlib/Utility.java | 50 +++ .../java/eu/dnetlib/pace/DedupTestIT.java | 3 +- .../java/eu/dnetlib/pace/SparkTester.java | 17 + dnet-dedup.ipr | 109 +++++ dnet-dedup.iws | 418 ++++++++++++++++++ dnet-openaire-data-protos/.DS_Store | Bin 0 -> 6148 bytes dnet-openaire-data-protos/pom.xml | 2 +- dnet-pace-core/pom.xml | 1 + .../java/eu/dnetlib/pace/model/FieldDef.java | 27 +- .../eu/dnetlib/pace/config/ConfigTest.java | 2 +- .../eu/dnetlib/pace/config/org.curr.conf | 36 ++ pom.xml | 160 ++++++- 19 files changed, 1249 insertions(+), 65 deletions(-) create mode 100644 dependencies.txt create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/.DS_Store create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/Utility.java create mode 100644 dnet-dedup-test/src/test/java/eu/dnetlib/pace/SparkTester.java create mode 100644 dnet-dedup.ipr create mode 100644 dnet-dedup.iws create mode 100644 dnet-openaire-data-protos/.DS_Store create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf diff --git a/.DS_Store b/.DS_Store index f316c9219296a9a98d8caa439cdb8bb91f2e01fc..6e3f1637d5b17caf360870ae4679b8b5b7b5cbdb 100644 GIT binary patch literal 8196 zcmeHMU2GIp6h5aEIx~g#7WrvAP&PE!23lwfw(=uxe?tWVVYjpt=rX%=fr-6IF(EcaO*AIH`D9FdFvg#A=gtx+@L&@}<4khz zx%ZxP?!EinZ|>Q%3jo+tFxvrY0YIT5B&($ACXdjIa>G*=jZ~0C@&{-I6F8Luw+q?B8SBLX6KucnOMp}ak~E`M*OJ_jEH)6dyi*R{o)S9gu6OG!Q_M*}SI zs3nqrgPaU%h zDlmeA-(z?gwk&0tJZpHvmKhvPbEn_tMegkKb{GNo_6y@Q?T8Fy+sN*<_(X;+>UEvK zupI7%^T|053|o%rPV~41$Mi${X!5YV+X#-*JW69Sdx*9OpekyH)lN=!bab>N*0Y4`jj1?X`_0s9Huf6`pg^QsVV01A-jqG2oFHvh2u$533 z^UT8C_KUp|^|BXlhPpoZfVLyU78VJ&UHQOr9e*$Pe3BVf?GwI(IoH|kaiJg1xpraP z@iVL@XV|%dO^48J+bQcf_eTPLED)_x-}Qpt&@jVT!RLE8*;z&2N7nv`Mbim0hU&Tb zi?0!xs0{8hl4h zJH|MlW=odV*469!OzYAtBZaK>P{9fwW{rw6&8n&wG_26I114={r)!GYO!l6lhML9| zy1rj%dOc3H896jH>ss0uJ2oQ2mR3zmKS*3+zA9bYJ2ZXBHnQAirYhIz?9%k%qC}Ey zj|Mky(e#m=J3dYy#jnAZXgJxe>tkZ&9flw5%5#Tyby+M=pRS*FwNL1}O;hZN_}MHe z6(XQECfj6};$|JJr6{=#4$w=Ohm-I$oPuZIG@OO=@Gg7^pTK4K6h4D1@C|$qKf=%O z8~hG`z@Mn#VqA)~ScfZdEv~~h+=yE-iQU+P+i^GU!4#(PAm)(cVLXC)Jch?{5+A|G z@f1Fb&*AgXX@hyBC-^UAh5ij9K_&I)o({g`hrT1cIwBJbJx73&&v7_sxZ0u&l zuJ0evw%r+Ge?4EYxn|*AixxLDHn(NC#yTEc}alQatHRiaOL(ARNxTGC!y zUgG&o{tpp5`7uRMj8#bMMEy#ISj80CT~*(t5cOGE*wt3olck}sIa0e$YgdTMY_8O9 z&^lEjJ1dvkjoM~~IL+ow*DbAGa^F@>V<6s1Ra&pASLt^A+dMr3=imeQm>7BmzJ%}K z2lxqoMZ^lM3Aw4`GHk#mY{u2N23xTcyNIQma0~Y10B*xUVrm+Pa2Q8$l$dIUOtsKM z9}8F{#y*NC@dl`6(!ThRg0%o28S=Mcd1zEocit$(X~NaWfH zDu<|uN7zkC8M;s!x`2QD| C=aNnU delta 110 zcmZp1XfcprU|?W$DortDU=RQ@Ie-{MGqg`E6q~50$jCG?z------------------------ +[INFO] Building dnet-dedup 3.0.3-SNAPSHOT [1/3] +[INFO] --------------------------------[ pom ]--------------------------------- +[INFO] +[INFO] --- maven-dependency-plugin:3.0.0:tree (default-cli) @ dnet-dedup --- +[INFO] eu.dnetlib:dnet-dedup:pom:3.0.3-SNAPSHOT +[INFO] +[INFO] ---------------------< eu.dnetlib:dnet-pace-core >---------------------- +[INFO] Building dnet-pace-core 3.0.3-SNAPSHOT [2/3] +[INFO] --------------------------------[ jar ]--------------------------------- +[INFO] +[INFO] --- maven-dependency-plugin:3.0.0:tree (default-cli) @ dnet-pace-core --- +[INFO] eu.dnetlib:dnet-pace-core:jar:3.0.3-SNAPSHOT +[INFO] +- edu.cmu:secondstring:jar:1.0.0:compile +[INFO] +- com.google.guava:guava:jar:15.0:compile +[INFO] +- com.google.code.gson:gson:jar:2.2.2:compile +[INFO] +- commons-lang:commons-lang:jar:2.6:compile +[INFO] +- commons-io:commons-io:jar:2.4:compile +[INFO] +- commons-collections:commons-collections:jar:3.2.1:compile +[INFO] +- com.googlecode.protobuf-java-format:protobuf-java-format:jar:1.2:compile +[INFO] +- org.antlr:stringtemplate:jar:3.2:compile +[INFO] | \- org.antlr:antlr:jar:2.7.7:compile +[INFO] +- commons-logging:commons-logging:jar:1.1.3:compile +[INFO] +- junit:junit:jar:4.9:test +[INFO] | \- org.hamcrest:hamcrest-core:jar:1.1:test +[INFO] +- org.reflections:reflections:jar:0.9.10:compile +[INFO] | +- org.javassist:javassist:jar:3.19.0-GA:compile +[INFO] | \- com.google.code.findbugs:annotations:jar:2.0.1:compile +[INFO] +- com.fasterxml.jackson.core:jackson-databind:jar:2.6.6:compile +[INFO] | +- com.fasterxml.jackson.core:jackson-annotations:jar:2.6.0:compile +[INFO] | \- com.fasterxml.jackson.core:jackson-core:jar:2.6.6:compile +[INFO] +- org.codehaus.jackson:jackson-mapper-asl:jar:1.9.13:compile +[INFO] | \- org.codehaus.jackson:jackson-core-asl:jar:1.9.13:compile +[INFO] \- org.apache.commons:commons-math3:jar:3.6.1:compile +[INFO] +[INFO] ---------------------< eu.dnetlib:dnet-dedup-test >--------------------- +[INFO] Building dnet-dedup-test 3.0.3-SNAPSHOT [3/3] +[INFO] --------------------------------[ jar ]--------------------------------- +[INFO] +[INFO] --- maven-dependency-plugin:3.0.0:tree (default-cli) @ dnet-dedup-test --- +[INFO] eu.dnetlib:dnet-dedup-test:jar:3.0.3-SNAPSHOT +[INFO] +- eu.dnetlib:dnet-pace-core:jar:3.0.3-SNAPSHOT:compile +[INFO] | +- edu.cmu:secondstring:jar:1.0.0:compile +[INFO] | +- com.google.guava:guava:jar:15.0:compile +[INFO] | +- com.google.code.gson:gson:jar:2.2.2:compile +[INFO] | +- commons-lang:commons-lang:jar:2.6:compile +[INFO] | +- commons-io:commons-io:jar:2.4:compile +[INFO] | +- commons-collections:commons-collections:jar:3.2.1:compile +[INFO] | +- com.googlecode.protobuf-java-format:protobuf-java-format:jar:1.2:compile +[INFO] | +- org.antlr:stringtemplate:jar:3.2:compile +[INFO] | | \- org.antlr:antlr:jar:2.7.7:compile +[INFO] | +- commons-logging:commons-logging:jar:1.1.3:compile +[INFO] | +- org.reflections:reflections:jar:0.9.10:compile +[INFO] | | +- org.javassist:javassist:jar:3.19.0-GA:compile +[INFO] | | \- com.google.code.findbugs:annotations:jar:2.0.1:compile +[INFO] | +- com.fasterxml.jackson.core:jackson-databind:jar:2.6.6:compile +[INFO] | | +- com.fasterxml.jackson.core:jackson-annotations:jar:2.6.0:compile +[INFO] | | \- com.fasterxml.jackson.core:jackson-core:jar:2.6.6:compile +[INFO] | +- org.codehaus.jackson:jackson-mapper-asl:jar:1.9.13:compile +[INFO] | | \- org.codehaus.jackson:jackson-core-asl:jar:1.9.13:compile +[INFO] | \- org.apache.commons:commons-math3:jar:3.6.1:compile +[INFO] +- eu.dnetlib:dnet-openaire-data-protos:jar:3.9.3-proto250:compile +[INFO] | +- com.google.protobuf:protobuf-java:jar:2.5.0:compile +[INFO] | \- log4j:log4j:jar:1.2.17:compile (version selected from constraint [1.2.17,1.2.17]) +[INFO] +- org.apache.spark:spark-core_2.11:jar:2.2.0:provided +[INFO] | +- org.apache.avro:avro:jar:1.7.7:provided +[INFO] | | +- com.thoughtworks.paranamer:paranamer:jar:2.3:provided +[INFO] | | \- org.apache.commons:commons-compress:jar:1.4.1:provided +[INFO] | | \- org.tukaani:xz:jar:1.0:provided +[INFO] | +- org.apache.avro:avro-mapred:jar:hadoop2:1.7.7:provided +[INFO] | | +- org.apache.avro:avro-ipc:jar:1.7.7:provided +[INFO] | | \- org.apache.avro:avro-ipc:jar:tests:1.7.7:provided +[INFO] | +- com.twitter:chill_2.11:jar:0.8.0:provided +[INFO] | | \- com.esotericsoftware:kryo-shaded:jar:3.0.3:provided +[INFO] | | +- com.esotericsoftware:minlog:jar:1.3.0:provided +[INFO] | | \- org.objenesis:objenesis:jar:2.1:provided +[INFO] | +- com.twitter:chill-java:jar:0.8.0:provided +[INFO] | +- org.apache.xbean:xbean-asm5-shaded:jar:4.4:provided +[INFO] | +- org.apache.hadoop:hadoop-client:jar:2.6.5:provided +[INFO] | | +- org.apache.hadoop:hadoop-common:jar:2.6.5:provided +[INFO] | | | +- commons-cli:commons-cli:jar:1.2:provided +[INFO] | | | +- xmlenc:xmlenc:jar:0.52:provided +[INFO] | | | +- commons-httpclient:commons-httpclient:jar:3.1:provided +[INFO] | | | +- commons-configuration:commons-configuration:jar:1.6:provided +[INFO] | | | | +- commons-digester:commons-digester:jar:1.8:provided +[INFO] | | | | | \- commons-beanutils:commons-beanutils:jar:1.7.0:provided +[INFO] | | | | \- commons-beanutils:commons-beanutils-core:jar:1.8.0:provided +[INFO] | | | +- org.apache.hadoop:hadoop-auth:jar:2.6.5:provided +[INFO] | | | | \- org.apache.directory.server:apacheds-kerberos-codec:jar:2.0.0-M15:provided +[INFO] | | | | +- org.apache.directory.server:apacheds-i18n:jar:2.0.0-M15:provided +[INFO] | | | | +- org.apache.directory.api:api-asn1-api:jar:1.0.0-M20:provided +[INFO] | | | | \- org.apache.directory.api:api-util:jar:1.0.0-M20:provided +[INFO] | | | +- org.apache.curator:curator-client:jar:2.6.0:provided +[INFO] | | | \- org.htrace:htrace-core:jar:3.0.4:provided +[INFO] | | +- org.apache.hadoop:hadoop-hdfs:jar:2.6.5:provided +[INFO] | | | +- org.mortbay.jetty:jetty-util:jar:6.1.26:provided +[INFO] | | | \- xerces:xercesImpl:jar:2.9.1:provided +[INFO] | | | \- xml-apis:xml-apis:jar:1.3.04:provided +[INFO] | | +- org.apache.hadoop:hadoop-mapreduce-client-app:jar:2.6.5:provided +[INFO] | | | +- org.apache.hadoop:hadoop-mapreduce-client-common:jar:2.6.5:provided +[INFO] | | | | +- org.apache.hadoop:hadoop-yarn-client:jar:2.6.5:provided +[INFO] | | | | \- org.apache.hadoop:hadoop-yarn-server-common:jar:2.6.5:provided +[INFO] | | | \- org.apache.hadoop:hadoop-mapreduce-client-shuffle:jar:2.6.5:provided +[INFO] | | +- org.apache.hadoop:hadoop-yarn-api:jar:2.6.5:provided +[INFO] | | +- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.6.5:provided +[INFO] | | | \- org.apache.hadoop:hadoop-yarn-common:jar:2.6.5:provided +[INFO] | | | +- javax.xml.bind:jaxb-api:jar:2.2.2:provided +[INFO] | | | | \- javax.xml.stream:stax-api:jar:1.0-2:provided +[INFO] | | | +- org.codehaus.jackson:jackson-jaxrs:jar:1.9.13:provided +[INFO] | | | \- org.codehaus.jackson:jackson-xc:jar:1.9.13:provided +[INFO] | | +- org.apache.hadoop:hadoop-mapreduce-client-jobclient:jar:2.6.5:provided +[INFO] | | \- org.apache.hadoop:hadoop-annotations:jar:2.6.5:provided +[INFO] | +- org.apache.spark:spark-launcher_2.11:jar:2.2.0:provided +[INFO] | +- org.apache.spark:spark-network-common_2.11:jar:2.2.0:provided +[INFO] | | \- org.fusesource.leveldbjni:leveldbjni-all:jar:1.8:provided +[INFO] | +- org.apache.spark:spark-network-shuffle_2.11:jar:2.2.0:provided +[INFO] | +- org.apache.spark:spark-unsafe_2.11:jar:2.2.0:provided +[INFO] | +- net.java.dev.jets3t:jets3t:jar:0.9.3:provided +[INFO] | | +- org.apache.httpcomponents:httpcore:jar:4.3.3:provided +[INFO] | | +- org.apache.httpcomponents:httpclient:jar:4.3.6:provided +[INFO] | | +- javax.activation:activation:jar:1.1.1:provided +[INFO] | | +- mx4j:mx4j:jar:3.0.2:provided +[INFO] | | +- javax.mail:mail:jar:1.4.7:provided +[INFO] | | +- org.bouncycastle:bcprov-jdk15on:jar:1.51:provided +[INFO] | | \- com.jamesmurty.utils:java-xmlbuilder:jar:1.0:provided +[INFO] | | \- net.iharder:base64:jar:2.3.8:provided +[INFO] | +- org.apache.curator:curator-recipes:jar:2.6.0:provided +[INFO] | | +- org.apache.curator:curator-framework:jar:2.6.0:provided +[INFO] | | \- org.apache.zookeeper:zookeeper:jar:3.4.6:provided +[INFO] | +- javax.servlet:javax.servlet-api:jar:3.1.0:provided +[INFO] | +- org.apache.commons:commons-lang3:jar:3.5:provided +[INFO] | +- com.google.code.findbugs:jsr305:jar:1.3.9:provided +[INFO] | +- org.slf4j:slf4j-api:jar:1.7.16:provided +[INFO] | +- org.slf4j:jul-to-slf4j:jar:1.7.16:provided +[INFO] | +- org.slf4j:jcl-over-slf4j:jar:1.7.16:provided +[INFO] | +- org.slf4j:slf4j-log4j12:jar:1.7.16:provided +[INFO] | +- com.ning:compress-lzf:jar:1.0.3:provided +[INFO] | +- org.xerial.snappy:snappy-java:jar:1.1.2.6:provided +[INFO] | +- net.jpountz.lz4:lz4:jar:1.3.0:provided +[INFO] | +- org.roaringbitmap:RoaringBitmap:jar:0.5.11:provided +[INFO] | +- commons-net:commons-net:jar:2.2:provided +[INFO] | +- org.scala-lang:scala-library:jar:2.11.8:provided +[INFO] | +- org.json4s:json4s-jackson_2.11:jar:3.2.11:provided +[INFO] | | \- org.json4s:json4s-core_2.11:jar:3.2.11:provided +[INFO] | | +- org.json4s:json4s-ast_2.11:jar:3.2.11:provided +[INFO] | | \- org.scala-lang:scalap:jar:2.11.0:provided +[INFO] | | \- org.scala-lang:scala-compiler:jar:2.11.0:provided +[INFO] | | +- org.scala-lang.modules:scala-xml_2.11:jar:1.0.1:provided +[INFO] | | \- org.scala-lang.modules:scala-parser-combinators_2.11:jar:1.0.1:provided +[INFO] | +- org.glassfish.jersey.core:jersey-client:jar:2.22.2:provided +[INFO] | | +- javax.ws.rs:javax.ws.rs-api:jar:2.0.1:provided +[INFO] | | +- org.glassfish.hk2:hk2-api:jar:2.4.0-b34:provided +[INFO] | | | +- org.glassfish.hk2:hk2-utils:jar:2.4.0-b34:provided +[INFO] | | | \- org.glassfish.hk2.external:aopalliance-repackaged:jar:2.4.0-b34:provided +[INFO] | | +- org.glassfish.hk2.external:javax.inject:jar:2.4.0-b34:provided +[INFO] | | \- org.glassfish.hk2:hk2-locator:jar:2.4.0-b34:provided +[INFO] | +- org.glassfish.jersey.core:jersey-common:jar:2.22.2:provided +[INFO] | | +- javax.annotation:javax.annotation-api:jar:1.2:provided +[INFO] | | +- org.glassfish.jersey.bundles.repackaged:jersey-guava:jar:2.22.2:provided +[INFO] | | \- org.glassfish.hk2:osgi-resource-locator:jar:1.0.1:provided +[INFO] | +- org.glassfish.jersey.core:jersey-server:jar:2.22.2:provided +[INFO] | | +- org.glassfish.jersey.media:jersey-media-jaxb:jar:2.22.2:provided +[INFO] | | \- javax.validation:validation-api:jar:1.1.0.Final:provided +[INFO] | +- org.glassfish.jersey.containers:jersey-container-servlet:jar:2.22.2:provided +[INFO] | +- org.glassfish.jersey.containers:jersey-container-servlet-core:jar:2.22.2:provided +[INFO] | +- io.netty:netty-all:jar:4.0.43.Final:provided +[INFO] | +- io.netty:netty:jar:3.9.9.Final:provided +[INFO] | +- com.clearspring.analytics:stream:jar:2.7.0:provided +[INFO] | +- io.dropwizard.metrics:metrics-core:jar:3.1.2:provided +[INFO] | +- io.dropwizard.metrics:metrics-jvm:jar:3.1.2:provided +[INFO] | +- io.dropwizard.metrics:metrics-json:jar:3.1.2:provided +[INFO] | +- io.dropwizard.metrics:metrics-graphite:jar:3.1.2:provided +[INFO] | +- com.fasterxml.jackson.module:jackson-module-scala_2.11:jar:2.6.5:provided +[INFO] | | +- org.scala-lang:scala-reflect:jar:2.11.7:provided +[INFO] | | \- com.fasterxml.jackson.module:jackson-module-paranamer:jar:2.6.5:provided +[INFO] | +- org.apache.ivy:ivy:jar:2.4.0:provided +[INFO] | +- oro:oro:jar:2.0.8:provided +[INFO] | +- net.razorvine:pyrolite:jar:4.13:provided +[INFO] | +- net.sf.py4j:py4j:jar:0.10.4:provided +[INFO] | +- org.apache.spark:spark-tags_2.11:jar:2.2.0:provided +[INFO] | +- org.apache.commons:commons-crypto:jar:1.0.0:provided +[INFO] | \- org.spark-project.spark:unused:jar:1.0.0:provided +[INFO] +- org.apache.spark:spark-graphx_2.11:jar:2.2.0:provided +[INFO] | +- org.apache.spark:spark-mllib-local_2.11:jar:2.2.0:provided +[INFO] | | \- org.scalanlp:breeze_2.11:jar:0.13.1:provided +[INFO] | | +- org.scalanlp:breeze-macros_2.11:jar:0.13.1:provided +[INFO] | | +- net.sf.opencsv:opencsv:jar:2.3:provided +[INFO] | | +- com.github.rwl:jtransforms:jar:2.4.0:provided +[INFO] | | +- org.spire-math:spire_2.11:jar:0.13.0:provided +[INFO] | | | +- org.spire-math:spire-macros_2.11:jar:0.13.0:provided +[INFO] | | | \- org.typelevel:machinist_2.11:jar:0.6.1:provided +[INFO] | | \- com.chuusai:shapeless_2.11:jar:2.3.2:provided +[INFO] | | \- org.typelevel:macro-compat_2.11:jar:1.1.1:provided +[INFO] | +- com.github.fommil.netlib:core:jar:1.1.2:provided +[INFO] | \- net.sourceforge.f2j:arpack_combined_all:jar:0.1:provided +[INFO] +- org.apache.spark:spark-sql_2.11:jar:2.2.0:provided +[INFO] | +- com.univocity:univocity-parsers:jar:2.2.1:provided +[INFO] | +- org.apache.spark:spark-sketch_2.11:jar:2.2.0:provided +[INFO] | +- org.apache.spark:spark-catalyst_2.11:jar:2.2.0:provided +[INFO] | | +- org.codehaus.janino:janino:jar:3.0.0:provided +[INFO] | | +- org.codehaus.janino:commons-compiler:jar:3.0.0:provided +[INFO] | | \- org.antlr:antlr4-runtime:jar:4.5.3:provided +[INFO] | +- org.apache.parquet:parquet-column:jar:1.8.2:provided +[INFO] | | +- org.apache.parquet:parquet-common:jar:1.8.2:provided +[INFO] | | \- org.apache.parquet:parquet-encoding:jar:1.8.2:provided +[INFO] | \- org.apache.parquet:parquet-hadoop:jar:1.8.2:provided +[INFO] | +- org.apache.parquet:parquet-format:jar:2.3.1:provided +[INFO] | \- org.apache.parquet:parquet-jackson:jar:1.8.2:provided +[INFO] +- eu.dnetlib:dnet-openaireplus-mapping-utils:jar:6.2.18:test +[INFO] | +- com.ximpleware:vtd-xml:jar:2.13.4:test (version selected from constraint [2.12,3.0.0)) +[INFO] | +- commons-codec:commons-codec:jar:1.9:provided +[INFO] | +- dom4j:dom4j:jar:1.6.1:test (version selected from constraint [1.6.1,1.6.1]) +[INFO] | +- net.sf.supercsv:super-csv:jar:2.4.0:test +[INFO] | +- eu.dnetlib:cnr-misc-utils:jar:1.0.6-SNAPSHOT:test (version selected from constraint [1.0.0,2.0.0)) +[INFO] | | +- jaxen:jaxen:jar:1.1.6:test +[INFO] | | +- saxonica:saxon:jar:9.1.0.8:test +[INFO] | | +- saxonica:saxon-dom:jar:9.1.0.8:test +[INFO] | | +- jgrapht:jgrapht:jar:0.7.2:test +[INFO] | | +- net.sf.ehcache:ehcache:jar:2.8.0:test +[INFO] | | \- org.springframework:spring-test:jar:4.2.5.RELEASE:test (version selected from constraint [4.2.5.RELEASE,4.2.5.RELEASE]) +[INFO] | | \- org.springframework:spring-core:jar:4.2.5.RELEASE:test +[INFO] | +- eu.dnetlib:dnet-hadoop-commons:jar:2.0.2-SNAPSHOT:test (version selected from constraint [2.0.0,3.0.0)) +[INFO] | | +- org.apache.hadoop:hadoop-core:jar:2.0.0-mr1-cdh4.7.0:test +[INFO] | | | +- commons-el:commons-el:jar:1.0:test +[INFO] | | | \- hsqldb:hsqldb:jar:1.8.0.10:test +[INFO] | | \- org.springframework:spring-beans:jar:4.2.5.RELEASE:test (version selected from constraint [4.2.5.RELEASE,4.2.5.RELEASE]) +[INFO] | \- eu.dnetlib:dnet-index-solr-common:jar:1.3.1:test (version selected from constraint [1.0.0,1.3.1]) +[INFO] | \- org.apache.solr:solr-solrj:jar:4.9.0:test +[INFO] | +- org.apache.httpcomponents:httpmime:jar:4.3.1:test +[INFO] | \- org.noggit:noggit:jar:0.5:test +[INFO] \- junit:junit:jar:4.9:test +[INFO] \- org.hamcrest:hamcrest-core:jar:1.1:test +[INFO] ------------------------------------------------------------------------ +[INFO] Reactor Summary: +[INFO] +[INFO] dnet-dedup 3.0.3-SNAPSHOT .......................... SUCCESS [ 1.152 s] +[INFO] dnet-pace-core ..................................... SUCCESS [ 0.117 s] +[INFO] dnet-dedup-test 3.0.3-SNAPSHOT ..................... SUCCESS [ 1.407 s] +[INFO] ------------------------------------------------------------------------ +[INFO] BUILD SUCCESS +[INFO] ------------------------------------------------------------------------ +[INFO] Total time: 3.216 s +[INFO] Finished at: 2019-03-29T15:02:42+01:00 +[INFO] ------------------------------------------------------------------------ diff --git a/dnet-dedup-test/.DS_Store b/dnet-dedup-test/.DS_Store index 638b0d7026a296895fb31efe27e5047a6d991ff3..3a130ba291d1761dc6dd3e3a27538aef719692a9 100644 GIT binary patch delta 53 zcmZoMXfc@J&&V_}zo{JeAq1_s8B8{OD9 Jvvd6A2LO!j4)_27 delta 28 icmZoMXfc@J&&W72z#2&O$ZR~~%s%mg%w~3uzx)7&#|f(d diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 00d1a45..79ce026 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -14,16 +14,17 @@ jar - src/main/java org.apache.maven.plugins maven-deploy-plugin 2.7 - - - + + true + + + org.apache.maven.plugins maven-compiler-plugin @@ -31,11 +32,50 @@ 1.8 1.8 - src/main/java/**/*.java - src/main/java/**/*.scala + **/*.java + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + + + + + + + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + @@ -78,12 +118,22 @@ test + + com.fasterxml.jackson.core + jackson-databind + + org.apache.oozie oozie-client test + + org.scala-lang + scala-library + + \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/.DS_Store b/dnet-dedup-test/src/main/java/eu/dnetlib/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e3c5536e2ac2ca6dfd8fd97aa8bb45169093919f GIT binary patch literal 6148 zcmeHKI|>3Z5S{S@f{mqRuHX%V=n3`$3W64*B51vp=kjR2`83O7r-kwcCNG)HOUNsB zc0@#%x7|WyCL$BKp*(Epo9&zTY>*KJ!g0po-ds<|^J(A9eHSq9SoU(2tz0^M+o4eb zDnJFO02QDDpH?6%>}2}sgLxhmpaMUyfPEhd+^{CLf&S^h;4J`fgs>at-b( dataRDD = context.textFile(dataset.getPath()); + + counter = new SparkCounter(context); + + //read the configuration from the classpath + final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/org.curr.conf")); + + BlockProcessor.constructAccumulator(config); + BlockProcessor.accumulators.forEach(acc -> { + + final String[] values = acc.split("::"); + counter.incrementCounter(values[0], values[1], 0); + + }); + + //create vertexes of the graph: + JavaPairRDD mapDocs = dataRDD.mapToPair(it -> { + MapDocument mapDocument = PaceUtils.asMapDocument(config, it); + return new Tuple2<>(mapDocument.getIdentifier(), mapDocument); + }); + RDD> vertexes = mapDocs.mapToPair(t -> new Tuple2( (long) t._1().hashCode(), t._2())).rdd(); + + //create relations between documents + JavaPairRDD> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id + //Clustering: from to List + .flatMapToPair(a -> { + final MapDocument currentDocument = a._2(); + + return Utility.getGroupingKeys(config, currentDocument).stream() + .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator(); + }).groupByKey();//group documents basing on the key + + //print blocks + blocks.foreach(b -> { + String print = b._1() + ": "; + for (MapDocument doc : b._2()) { + print += doc.getIdentifier() + " "; + } + System.out.println(print); + }); + + //create relations by comparing only elements in the same group + final JavaPairRDD relationRDD = blocks.flatMapToPair(it -> { + final SparkReporter reporter = new SparkReporter(counter); + new BlockProcessor(config).process(it._1(), it._2(), reporter); + return reporter.getReport().iterator(); + }); + + final RDD> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(),it._2().hashCode(), "similarTo")).rdd(); + + JavaRDD ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); + + final JavaRDD connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1); + final JavaRDD nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1); + + System.out.println("Non duplicates: " + nonDeduplicated.count()); + System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count()); + System.out.println("Connected Components: " + connectedComponents.count()); + + counter.getAccumulators().values().forEach(it-> System.out.println(it.getGroup()+" "+it.getName()+" -->"+it.value())); + + //print deduped + connectedComponents.foreach(cc -> { + System.out.println("cc = " + cc.getId()); + for (MapDocument doc: cc.getDocs()) { + System.out.println(doc.getIdentifier() + "; ln: " + doc.getFieldMap().get("legalname").stringValue() + "; sn: " + doc.getFieldMap().get("legalshortname").stringValue()); + } + }); + //print nondeduped + nonDeduplicated.foreach(cc -> { + System.out.println("nd = " + cc.getId()); + System.out.println(cc.getDocs().iterator().next().getFieldMap().get("legalname").stringValue() + "; sn: " + cc.getDocs().iterator().next().getFieldMap().get("legalshortname").stringValue()); + }); + + //print ids +//// ccs.foreach(cc -> System.out.println(cc.getId())); +//// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); + + } + +} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java index aa4dcd6..b133e4c 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java @@ -1,42 +1,41 @@ package eu.dnetlib; -import com.google.common.collect.Sets; import eu.dnetlib.graph.GraphProcessor; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.utils.PaceUtils; import eu.dnetlib.reporter.SparkCounter; import eu.dnetlib.reporter.SparkReporter; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.graphx.Edge; import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.SparkSession; import scala.Tuple2; import java.io.IOException; -import java.io.StringWriter; -import java.net.URL; -import java.util.Set; import java.util.stream.Collectors; public class SparkTest { public static SparkCounter counter ; - public static void main(String[] args) { - final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("yarn")); + public static void main(String[] args) throws IOException { - final URL dataset = SparkTest.class.getResource(args[1]); - final JavaRDD dataRDD = context.textFile(dataset.getPath()); + final SparkSession spark = SparkSession + .builder() + .appName("Deduplication") + .master("yarn") + .getOrCreate(); + + final JavaSparkContext context = new JavaSparkContext(spark.sparkContext()); + + final JavaRDD dataRDD = Utility.loadDataFromHDFS(args[0], context); counter = new SparkCounter(context); - //read the configuration from the classpath - final DedupConfig config = DedupConfig.load(readFromClasspath(args[0])); + final DedupConfig config = Utility.loadConfigFromHDFS(args[1]); BlockProcessor.constructAccumulator(config); BlockProcessor.accumulators.forEach(acc -> { @@ -59,7 +58,7 @@ public class SparkTest { .flatMapToPair(a -> { final MapDocument currentDocument = a._2(); - return getGroupingKeys(config, currentDocument).stream() + return Utility.getGroupingKeys(config, currentDocument).stream() .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator(); }).groupByKey();//group documents basing on the key @@ -105,24 +104,10 @@ public class SparkTest { System.out.println(cc.getDocs().iterator().next().getFieldMap().get("legalname").stringValue() + "; sn: " + cc.getDocs().iterator().next().getFieldMap().get("legalshortname").stringValue()); }); - //print ids -//// ccs.foreach(cc -> System.out.println(cc.getId())); -//// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); +// print ids +// ccs.foreach(cc -> System.out.println(cc.getId())); +// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); } - static String readFromClasspath(final String filename) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(SparkTest.class.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } - } \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Utility.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Utility.java new file mode 100644 index 0000000..ebab8dc --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Utility.java @@ -0,0 +1,50 @@ +package eu.dnetlib; + +import com.google.common.collect.Sets; +import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.IOException; +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; +import java.util.Set; + +public class Utility { + + public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { + return context.textFile(path); + } + + public static DedupConfig loadConfigFromHDFS(String path) throws IOException { + + Configuration conf = new Configuration(); +// conf.set("fs.defaultFS", ""); + FileSystem fileSystem = FileSystem.get(conf); + FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); + + return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); + + } + + static String readFromClasspath(final String filename) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(SparkTest.class.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } + + static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { + return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); + } +} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java index 3b9ff1b..77a247f 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java @@ -6,8 +6,7 @@ import org.apache.oozie.client.OozieClientException; import org.apache.oozie.client.WorkflowJob; import org.junit.Test; -import java.io.IOException; -import java.io.StringWriter; +import java.io.*; import java.util.Properties; import static junit.framework.Assert.assertEquals; diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/SparkTester.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/SparkTester.java new file mode 100644 index 0000000..7402c86 --- /dev/null +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/SparkTester.java @@ -0,0 +1,17 @@ +package eu.dnetlib.pace; + +import eu.dnetlib.SparkLocalTest; +import org.junit.Test; + +import java.io.IOException; + +public class SparkTester { + + @Test + public void sparkLocalTest() throws IOException { + + SparkLocalTest.main(new String[]{}); + } + + +} diff --git a/dnet-dedup.ipr b/dnet-dedup.ipr new file mode 100644 index 0000000..dc3974c --- /dev/null +++ b/dnet-dedup.ipr @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dnet-dedup.iws b/dnet-dedup.iws new file mode 100644 index 0000000..57de9a0 --- /dev/null +++ b/dnet-dedup.iws @@ -0,0 +1,418 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dnet-openaire-data-protos/.DS_Store b/dnet-openaire-data-protos/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..638b0d7026a296895fb31efe27e5047a6d991ff3 GIT binary patch literal 6148 zcmeHKJ8l9o5S<|it(0a9^us9E~@hB50G26hv>N@w3PC zZ21*CJ0hapdG{vLibw-Dl#2~hvwic4%`&1uIL_G1u>0B`K90x3B>U%paThYkMo#ki zoo^d7DnJFO02QDDRN&zXWQlEFAAT|qqykjn*%h$wLxCIC#4*r69T>a?0L~G1!`yob zV6gzOCXRuKz%;19plY@l8g#@<=GDY8FzBM$d}!XR*`cW4j`NGBi`GDnRDcTHEASG_ z*6RNX{!9OVPvVLSP=SA?fX@5(zQdEUwoV>rwYI?5aLc*E%`kTg1~11zFUMF|IbM2F bPDkA7K>iGvE;K6e*9zPK5X%*r literal 0 HcmV?d00001 diff --git a/dnet-openaire-data-protos/pom.xml b/dnet-openaire-data-protos/pom.xml index 47ed2ea..f247374 100644 --- a/dnet-openaire-data-protos/pom.xml +++ b/dnet-openaire-data-protos/pom.xml @@ -10,7 +10,7 @@ eu.dnetlib dnet-openaire-data-protos jar - 3.9.4-CUSTOM + 3.9.4-proto250 diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index d88dea4..925a613 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -64,6 +64,7 @@ com.fasterxml.jackson.core jackson-databind + org.codehaus.jackson jackson-mapper-asl diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index b954df7..6b85cf4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -1,19 +1,16 @@ package eu.dnetlib.pace.model; -import java.io.Serializable; -import java.lang.reflect.InvocationTargetException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.gson.Gson; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.distance.*; -import eu.dnetlib.pace.distance.algo.*; -import eu.dnetlib.pace.util.PaceException; +import eu.dnetlib.pace.distance.DistanceAlgo; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. @@ -60,6 +57,18 @@ public class FieldDef implements Serializable { return name; } + public void setName(String name) { + this.name = name; + } + + public void setPath(String path) { + this.path = path; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + public String getPath() { return path; } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index d47768b..575b3c7 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -32,7 +32,7 @@ public class ConfigTest extends AbstractPaceTest { @Test public void dedupConfigTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf")); System.out.println(load.toString()); } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf new file mode 100644 index 0000000..fd4fbbe --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -0,0 +1,36 @@ +{ + "wf" : { + "threshold" : "0.9", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } + ], + "conditions" : [ + { "name" : "exactMatch", "fields" : [ "country" ] }, + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + ], + "model" : [ + { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + ], + "blacklists" : { } + } +} \ No newline at end of file diff --git a/pom.xml b/pom.xml index b7f7992..9783146 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,8 @@ https://issue.openaire.research-infrastructures.eu/projects/openaire + + dnet45-releases @@ -70,6 +72,18 @@ + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + @@ -77,22 +91,125 @@ target/classes ${project.artifactId}-${project.version} target/test-classes + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.0 + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + true + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.0 + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.13 + + + integration-test + + integration-test + + + + verify + + verify + + + + + + + + + org.apache.maven.plugins - maven-compiler-plugin - 3.6.0 - - 1.8 - 1.8 - ${project.build.sourceEncoding} - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.0.0 + maven-release-plugin + 2.5.3 + + + + + + + + + + + + + + + + + + + + + + + + @@ -113,7 +230,7 @@ - + @@ -137,7 +254,7 @@ eu.dnetlib dnet-openaire-data-protos - 3.9.4-CUSTOM + 3.9.3-proto250 eu.dnetlib @@ -148,8 +265,9 @@ com.fasterxml.jackson.core jackson-databind - 2.6.6 + ${jackson.version} + org.codehaus.jackson jackson-mapper-asl @@ -196,16 +314,19 @@ org.apache.spark spark-core_2.11 ${spark.version} + compile org.apache.spark spark-graphx_2.11 ${spark.version} + compile org.apache.spark spark-sql_2.11 ${spark.version} + compile junit @@ -219,6 +340,12 @@ 0.9.10 + + org.scala-lang + scala-library + ${scala.version} + + org.apache.oozie oozie-client @@ -227,6 +354,7 @@ + UTF-8 @@ -236,6 +364,7 @@ 15.0 2.2.0 + 2.6.6 2.6 2.4 @@ -243,6 +372,7 @@ 1.1.3 4.9 + 2.11.8 false