diff --git a/dependencies.txt b/dependencies.txt deleted file mode 100644 index 29c11c16d..000000000 --- a/dependencies.txt +++ /dev/null @@ -1,252 +0,0 @@ -[INFO] Scanning for projects... -[INFO] ------------------------------------------------------------------------ -[INFO] Reactor Build Order: -[INFO] -[INFO] dnet-dedup [pom] -[INFO] dnet-pace-core [jar] -[INFO] dnet-dedup-test [jar] -[INFO] -[INFO] -----------------------< eu.dnetlib:dnet-dedup >------------------------ -[INFO] Building dnet-dedup 3.0.3-SNAPSHOT [1/3] -[INFO] --------------------------------[ pom ]--------------------------------- -[INFO] -[INFO] --- maven-dependency-plugin:3.0.0:tree (default-cli) @ dnet-dedup --- -[INFO] eu.dnetlib:dnet-dedup:pom:3.0.3-SNAPSHOT -[INFO] -[INFO] ---------------------< eu.dnetlib:dnet-pace-core >---------------------- -[INFO] Building dnet-pace-core 3.0.3-SNAPSHOT [2/3] -[INFO] --------------------------------[ jar ]--------------------------------- -[INFO] -[INFO] --- maven-dependency-plugin:3.0.0:tree (default-cli) @ dnet-pace-core --- -[INFO] eu.dnetlib:dnet-pace-core:jar:3.0.3-SNAPSHOT -[INFO] +- edu.cmu:secondstring:jar:1.0.0:compile -[INFO] +- com.google.guava:guava:jar:15.0:compile -[INFO] +- com.google.code.gson:gson:jar:2.2.2:compile -[INFO] +- commons-lang:commons-lang:jar:2.6:compile -[INFO] +- commons-io:commons-io:jar:2.4:compile -[INFO] +- commons-collections:commons-collections:jar:3.2.1:compile -[INFO] +- com.googlecode.protobuf-java-format:protobuf-java-format:jar:1.2:compile -[INFO] +- org.antlr:stringtemplate:jar:3.2:compile -[INFO] | \- org.antlr:antlr:jar:2.7.7:compile -[INFO] +- commons-logging:commons-logging:jar:1.1.3:compile -[INFO] +- junit:junit:jar:4.9:test -[INFO] | \- org.hamcrest:hamcrest-core:jar:1.1:test -[INFO] +- org.reflections:reflections:jar:0.9.10:compile -[INFO] | +- org.javassist:javassist:jar:3.19.0-GA:compile -[INFO] | \- com.google.code.findbugs:annotations:jar:2.0.1:compile -[INFO] +- com.fasterxml.jackson.core:jackson-databind:jar:2.6.6:compile -[INFO] | +- com.fasterxml.jackson.core:jackson-annotations:jar:2.6.0:compile -[INFO] | \- com.fasterxml.jackson.core:jackson-core:jar:2.6.6:compile -[INFO] +- org.codehaus.jackson:jackson-mapper-asl:jar:1.9.13:compile -[INFO] | \- org.codehaus.jackson:jackson-core-asl:jar:1.9.13:compile -[INFO] \- org.apache.commons:commons-math3:jar:3.6.1:compile -[INFO] -[INFO] ---------------------< eu.dnetlib:dnet-dedup-test >--------------------- -[INFO] Building dnet-dedup-test 3.0.3-SNAPSHOT [3/3] -[INFO] --------------------------------[ jar ]--------------------------------- -[INFO] -[INFO] --- maven-dependency-plugin:3.0.0:tree (default-cli) @ dnet-dedup-test --- -[INFO] eu.dnetlib:dnet-dedup-test:jar:3.0.3-SNAPSHOT -[INFO] +- eu.dnetlib:dnet-pace-core:jar:3.0.3-SNAPSHOT:compile -[INFO] | +- edu.cmu:secondstring:jar:1.0.0:compile -[INFO] | +- com.google.guava:guava:jar:15.0:compile -[INFO] | +- com.google.code.gson:gson:jar:2.2.2:compile -[INFO] | +- commons-lang:commons-lang:jar:2.6:compile -[INFO] | +- commons-io:commons-io:jar:2.4:compile -[INFO] | +- commons-collections:commons-collections:jar:3.2.1:compile -[INFO] | +- com.googlecode.protobuf-java-format:protobuf-java-format:jar:1.2:compile -[INFO] | +- org.antlr:stringtemplate:jar:3.2:compile -[INFO] | | \- org.antlr:antlr:jar:2.7.7:compile -[INFO] | +- commons-logging:commons-logging:jar:1.1.3:compile -[INFO] | +- org.reflections:reflections:jar:0.9.10:compile -[INFO] | | +- org.javassist:javassist:jar:3.19.0-GA:compile -[INFO] | | \- com.google.code.findbugs:annotations:jar:2.0.1:compile -[INFO] | +- com.fasterxml.jackson.core:jackson-databind:jar:2.6.6:compile -[INFO] | | +- com.fasterxml.jackson.core:jackson-annotations:jar:2.6.0:compile -[INFO] | | \- com.fasterxml.jackson.core:jackson-core:jar:2.6.6:compile -[INFO] | +- org.codehaus.jackson:jackson-mapper-asl:jar:1.9.13:compile -[INFO] | | \- org.codehaus.jackson:jackson-core-asl:jar:1.9.13:compile -[INFO] | \- org.apache.commons:commons-math3:jar:3.6.1:compile -[INFO] +- eu.dnetlib:dnet-openaire-data-protos:jar:3.9.3-proto250:compile -[INFO] | +- com.google.protobuf:protobuf-java:jar:2.5.0:compile -[INFO] | \- log4j:log4j:jar:1.2.17:compile (version selected from constraint [1.2.17,1.2.17]) -[INFO] +- org.apache.spark:spark-core_2.11:jar:2.2.0:provided -[INFO] | +- org.apache.avro:avro:jar:1.7.7:provided -[INFO] | | +- com.thoughtworks.paranamer:paranamer:jar:2.3:provided -[INFO] | | \- org.apache.commons:commons-compress:jar:1.4.1:provided -[INFO] | | \- org.tukaani:xz:jar:1.0:provided -[INFO] | +- org.apache.avro:avro-mapred:jar:hadoop2:1.7.7:provided -[INFO] | | +- org.apache.avro:avro-ipc:jar:1.7.7:provided -[INFO] | | \- org.apache.avro:avro-ipc:jar:tests:1.7.7:provided -[INFO] | +- com.twitter:chill_2.11:jar:0.8.0:provided -[INFO] | | \- com.esotericsoftware:kryo-shaded:jar:3.0.3:provided -[INFO] | | +- com.esotericsoftware:minlog:jar:1.3.0:provided -[INFO] | | \- org.objenesis:objenesis:jar:2.1:provided -[INFO] | +- com.twitter:chill-java:jar:0.8.0:provided -[INFO] | +- org.apache.xbean:xbean-asm5-shaded:jar:4.4:provided -[INFO] | +- org.apache.hadoop:hadoop-client:jar:2.6.5:provided -[INFO] | | +- org.apache.hadoop:hadoop-common:jar:2.6.5:provided -[INFO] | | | +- commons-cli:commons-cli:jar:1.2:provided -[INFO] | | | +- xmlenc:xmlenc:jar:0.52:provided -[INFO] | | | +- commons-httpclient:commons-httpclient:jar:3.1:provided -[INFO] | | | +- commons-configuration:commons-configuration:jar:1.6:provided -[INFO] | | | | +- commons-digester:commons-digester:jar:1.8:provided -[INFO] | | | | | \- commons-beanutils:commons-beanutils:jar:1.7.0:provided -[INFO] | | | | \- commons-beanutils:commons-beanutils-core:jar:1.8.0:provided -[INFO] | | | +- org.apache.hadoop:hadoop-auth:jar:2.6.5:provided -[INFO] | | | | \- org.apache.directory.server:apacheds-kerberos-codec:jar:2.0.0-M15:provided -[INFO] | | | | +- org.apache.directory.server:apacheds-i18n:jar:2.0.0-M15:provided -[INFO] | | | | +- org.apache.directory.api:api-asn1-api:jar:1.0.0-M20:provided -[INFO] | | | | \- org.apache.directory.api:api-util:jar:1.0.0-M20:provided -[INFO] | | | +- org.apache.curator:curator-client:jar:2.6.0:provided -[INFO] | | | \- org.htrace:htrace-core:jar:3.0.4:provided -[INFO] | | +- org.apache.hadoop:hadoop-hdfs:jar:2.6.5:provided -[INFO] | | | +- org.mortbay.jetty:jetty-util:jar:6.1.26:provided -[INFO] | | | \- xerces:xercesImpl:jar:2.9.1:provided -[INFO] | | | \- xml-apis:xml-apis:jar:1.3.04:provided -[INFO] | | +- org.apache.hadoop:hadoop-mapreduce-client-app:jar:2.6.5:provided -[INFO] | | | +- org.apache.hadoop:hadoop-mapreduce-client-common:jar:2.6.5:provided -[INFO] | | | | +- org.apache.hadoop:hadoop-yarn-client:jar:2.6.5:provided -[INFO] | | | | \- org.apache.hadoop:hadoop-yarn-server-common:jar:2.6.5:provided -[INFO] | | | \- org.apache.hadoop:hadoop-mapreduce-client-shuffle:jar:2.6.5:provided -[INFO] | | +- org.apache.hadoop:hadoop-yarn-api:jar:2.6.5:provided -[INFO] | | +- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.6.5:provided -[INFO] | | | \- org.apache.hadoop:hadoop-yarn-common:jar:2.6.5:provided -[INFO] | | | +- javax.xml.bind:jaxb-api:jar:2.2.2:provided -[INFO] | | | | \- javax.xml.stream:stax-api:jar:1.0-2:provided -[INFO] | | | +- org.codehaus.jackson:jackson-jaxrs:jar:1.9.13:provided -[INFO] | | | \- org.codehaus.jackson:jackson-xc:jar:1.9.13:provided -[INFO] | | +- org.apache.hadoop:hadoop-mapreduce-client-jobclient:jar:2.6.5:provided -[INFO] | | \- org.apache.hadoop:hadoop-annotations:jar:2.6.5:provided -[INFO] | +- org.apache.spark:spark-launcher_2.11:jar:2.2.0:provided -[INFO] | +- org.apache.spark:spark-network-common_2.11:jar:2.2.0:provided -[INFO] | | \- org.fusesource.leveldbjni:leveldbjni-all:jar:1.8:provided -[INFO] | +- org.apache.spark:spark-network-shuffle_2.11:jar:2.2.0:provided -[INFO] | +- org.apache.spark:spark-unsafe_2.11:jar:2.2.0:provided -[INFO] | +- net.java.dev.jets3t:jets3t:jar:0.9.3:provided -[INFO] | | +- org.apache.httpcomponents:httpcore:jar:4.3.3:provided -[INFO] | | +- org.apache.httpcomponents:httpclient:jar:4.3.6:provided -[INFO] | | +- javax.activation:activation:jar:1.1.1:provided -[INFO] | | +- mx4j:mx4j:jar:3.0.2:provided -[INFO] | | +- javax.mail:mail:jar:1.4.7:provided -[INFO] | | +- org.bouncycastle:bcprov-jdk15on:jar:1.51:provided -[INFO] | | \- com.jamesmurty.utils:java-xmlbuilder:jar:1.0:provided -[INFO] | | \- net.iharder:base64:jar:2.3.8:provided -[INFO] | +- org.apache.curator:curator-recipes:jar:2.6.0:provided -[INFO] | | +- org.apache.curator:curator-framework:jar:2.6.0:provided -[INFO] | | \- org.apache.zookeeper:zookeeper:jar:3.4.6:provided -[INFO] | +- javax.servlet:javax.servlet-api:jar:3.1.0:provided -[INFO] | +- org.apache.commons:commons-lang3:jar:3.5:provided -[INFO] | +- com.google.code.findbugs:jsr305:jar:1.3.9:provided -[INFO] | +- org.slf4j:slf4j-api:jar:1.7.16:provided -[INFO] | +- org.slf4j:jul-to-slf4j:jar:1.7.16:provided -[INFO] | +- org.slf4j:jcl-over-slf4j:jar:1.7.16:provided -[INFO] | +- org.slf4j:slf4j-log4j12:jar:1.7.16:provided -[INFO] | +- com.ning:compress-lzf:jar:1.0.3:provided -[INFO] | +- org.xerial.snappy:snappy-java:jar:1.1.2.6:provided -[INFO] | +- net.jpountz.lz4:lz4:jar:1.3.0:provided -[INFO] | +- org.roaringbitmap:RoaringBitmap:jar:0.5.11:provided -[INFO] | +- commons-net:commons-net:jar:2.2:provided -[INFO] | +- org.scala-lang:scala-library:jar:2.11.8:provided -[INFO] | +- org.json4s:json4s-jackson_2.11:jar:3.2.11:provided -[INFO] | | \- org.json4s:json4s-core_2.11:jar:3.2.11:provided -[INFO] | | +- org.json4s:json4s-ast_2.11:jar:3.2.11:provided -[INFO] | | \- org.scala-lang:scalap:jar:2.11.0:provided -[INFO] | | \- org.scala-lang:scala-compiler:jar:2.11.0:provided -[INFO] | | +- org.scala-lang.modules:scala-xml_2.11:jar:1.0.1:provided -[INFO] | | \- org.scala-lang.modules:scala-parser-combinators_2.11:jar:1.0.1:provided -[INFO] | +- org.glassfish.jersey.core:jersey-client:jar:2.22.2:provided -[INFO] | | +- javax.ws.rs:javax.ws.rs-api:jar:2.0.1:provided -[INFO] | | +- org.glassfish.hk2:hk2-api:jar:2.4.0-b34:provided -[INFO] | | | +- org.glassfish.hk2:hk2-utils:jar:2.4.0-b34:provided -[INFO] | | | \- org.glassfish.hk2.external:aopalliance-repackaged:jar:2.4.0-b34:provided -[INFO] | | +- org.glassfish.hk2.external:javax.inject:jar:2.4.0-b34:provided -[INFO] | | \- org.glassfish.hk2:hk2-locator:jar:2.4.0-b34:provided -[INFO] | +- org.glassfish.jersey.core:jersey-common:jar:2.22.2:provided -[INFO] | | +- javax.annotation:javax.annotation-api:jar:1.2:provided -[INFO] | | +- org.glassfish.jersey.bundles.repackaged:jersey-guava:jar:2.22.2:provided -[INFO] | | \- org.glassfish.hk2:osgi-resource-locator:jar:1.0.1:provided -[INFO] | +- org.glassfish.jersey.core:jersey-server:jar:2.22.2:provided -[INFO] | | +- org.glassfish.jersey.media:jersey-media-jaxb:jar:2.22.2:provided -[INFO] | | \- javax.validation:validation-api:jar:1.1.0.Final:provided -[INFO] | +- org.glassfish.jersey.containers:jersey-container-servlet:jar:2.22.2:provided -[INFO] | +- org.glassfish.jersey.containers:jersey-container-servlet-core:jar:2.22.2:provided -[INFO] | +- io.netty:netty-all:jar:4.0.43.Final:provided -[INFO] | +- io.netty:netty:jar:3.9.9.Final:provided -[INFO] | +- com.clearspring.analytics:stream:jar:2.7.0:provided -[INFO] | +- io.dropwizard.metrics:metrics-core:jar:3.1.2:provided -[INFO] | +- io.dropwizard.metrics:metrics-jvm:jar:3.1.2:provided -[INFO] | +- io.dropwizard.metrics:metrics-json:jar:3.1.2:provided -[INFO] | +- io.dropwizard.metrics:metrics-graphite:jar:3.1.2:provided -[INFO] | +- com.fasterxml.jackson.module:jackson-module-scala_2.11:jar:2.6.5:provided -[INFO] | | +- org.scala-lang:scala-reflect:jar:2.11.7:provided -[INFO] | | \- com.fasterxml.jackson.module:jackson-module-paranamer:jar:2.6.5:provided -[INFO] | +- org.apache.ivy:ivy:jar:2.4.0:provided -[INFO] | +- oro:oro:jar:2.0.8:provided -[INFO] | +- net.razorvine:pyrolite:jar:4.13:provided -[INFO] | +- net.sf.py4j:py4j:jar:0.10.4:provided -[INFO] | +- org.apache.spark:spark-tags_2.11:jar:2.2.0:provided -[INFO] | +- org.apache.commons:commons-crypto:jar:1.0.0:provided -[INFO] | \- org.spark-project.spark:unused:jar:1.0.0:provided -[INFO] +- org.apache.spark:spark-graphx_2.11:jar:2.2.0:provided -[INFO] | +- org.apache.spark:spark-mllib-local_2.11:jar:2.2.0:provided -[INFO] | | \- org.scalanlp:breeze_2.11:jar:0.13.1:provided -[INFO] | | +- org.scalanlp:breeze-macros_2.11:jar:0.13.1:provided -[INFO] | | +- net.sf.opencsv:opencsv:jar:2.3:provided -[INFO] | | +- com.github.rwl:jtransforms:jar:2.4.0:provided -[INFO] | | +- org.spire-math:spire_2.11:jar:0.13.0:provided -[INFO] | | | +- org.spire-math:spire-macros_2.11:jar:0.13.0:provided -[INFO] | | | \- org.typelevel:machinist_2.11:jar:0.6.1:provided -[INFO] | | \- com.chuusai:shapeless_2.11:jar:2.3.2:provided -[INFO] | | \- org.typelevel:macro-compat_2.11:jar:1.1.1:provided -[INFO] | +- com.github.fommil.netlib:core:jar:1.1.2:provided -[INFO] | \- net.sourceforge.f2j:arpack_combined_all:jar:0.1:provided -[INFO] +- org.apache.spark:spark-sql_2.11:jar:2.2.0:provided -[INFO] | +- com.univocity:univocity-parsers:jar:2.2.1:provided -[INFO] | +- org.apache.spark:spark-sketch_2.11:jar:2.2.0:provided -[INFO] | +- org.apache.spark:spark-catalyst_2.11:jar:2.2.0:provided -[INFO] | | +- org.codehaus.janino:janino:jar:3.0.0:provided -[INFO] | | +- org.codehaus.janino:commons-compiler:jar:3.0.0:provided -[INFO] | | \- org.antlr:antlr4-runtime:jar:4.5.3:provided -[INFO] | +- org.apache.parquet:parquet-column:jar:1.8.2:provided -[INFO] | | +- org.apache.parquet:parquet-common:jar:1.8.2:provided -[INFO] | | \- org.apache.parquet:parquet-encoding:jar:1.8.2:provided -[INFO] | \- org.apache.parquet:parquet-hadoop:jar:1.8.2:provided -[INFO] | +- org.apache.parquet:parquet-format:jar:2.3.1:provided -[INFO] | \- org.apache.parquet:parquet-jackson:jar:1.8.2:provided -[INFO] +- eu.dnetlib:dnet-openaireplus-mapping-utils:jar:6.2.18:test -[INFO] | +- com.ximpleware:vtd-xml:jar:2.13.4:test (version selected from constraint [2.12,3.0.0)) -[INFO] | +- commons-codec:commons-codec:jar:1.9:provided -[INFO] | +- dom4j:dom4j:jar:1.6.1:test (version selected from constraint [1.6.1,1.6.1]) -[INFO] | +- net.sf.supercsv:super-csv:jar:2.4.0:test -[INFO] | +- eu.dnetlib:cnr-misc-utils:jar:1.0.6-SNAPSHOT:test (version selected from constraint [1.0.0,2.0.0)) -[INFO] | | +- jaxen:jaxen:jar:1.1.6:test -[INFO] | | +- saxonica:saxon:jar:9.1.0.8:test -[INFO] | | +- saxonica:saxon-dom:jar:9.1.0.8:test -[INFO] | | +- jgrapht:jgrapht:jar:0.7.2:test -[INFO] | | +- net.sf.ehcache:ehcache:jar:2.8.0:test -[INFO] | | \- org.springframework:spring-test:jar:4.2.5.RELEASE:test (version selected from constraint [4.2.5.RELEASE,4.2.5.RELEASE]) -[INFO] | | \- org.springframework:spring-core:jar:4.2.5.RELEASE:test -[INFO] | +- eu.dnetlib:dnet-hadoop-commons:jar:2.0.2-SNAPSHOT:test (version selected from constraint [2.0.0,3.0.0)) -[INFO] | | +- org.apache.hadoop:hadoop-core:jar:2.0.0-mr1-cdh4.7.0:test -[INFO] | | | +- commons-el:commons-el:jar:1.0:test -[INFO] | | | \- hsqldb:hsqldb:jar:1.8.0.10:test -[INFO] | | \- org.springframework:spring-beans:jar:4.2.5.RELEASE:test (version selected from constraint [4.2.5.RELEASE,4.2.5.RELEASE]) -[INFO] | \- eu.dnetlib:dnet-index-solr-common:jar:1.3.1:test (version selected from constraint [1.0.0,1.3.1]) -[INFO] | \- org.apache.solr:solr-solrj:jar:4.9.0:test -[INFO] | +- org.apache.httpcomponents:httpmime:jar:4.3.1:test -[INFO] | \- org.noggit:noggit:jar:0.5:test -[INFO] \- junit:junit:jar:4.9:test -[INFO] \- org.hamcrest:hamcrest-core:jar:1.1:test -[INFO] ------------------------------------------------------------------------ -[INFO] Reactor Summary: -[INFO] -[INFO] dnet-dedup 3.0.3-SNAPSHOT .......................... SUCCESS [ 1.152 s] -[INFO] dnet-pace-core ..................................... SUCCESS [ 0.117 s] -[INFO] dnet-dedup-test 3.0.3-SNAPSHOT ..................... SUCCESS [ 1.407 s] -[INFO] ------------------------------------------------------------------------ -[INFO] BUILD SUCCESS -[INFO] ------------------------------------------------------------------------ -[INFO] Total time: 3.216 s -[INFO] Finished at: 2019-03-29T15:02:42+01:00 -[INFO] ------------------------------------------------------------------------ diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 3c0261376..9955d5fbe 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -22,12 +22,17 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu @Override public Collection apply(List fields) { - return fields.stream() - .filter(f -> !f.isEmpty()) - .map(Field::stringValue) - .map(this::asUrl) - .map(URL::getHost) - .collect(Collectors.toCollection(HashSet::new)); + try { + return fields.stream() + .filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::asUrl) + .map(URL::getHost) + .collect(Collectors.toCollection(HashSet::new)); + } + catch (IllegalStateException e){ + return new HashSet<>(); + } } @Override @@ -35,7 +40,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu return null; } - private URL asUrl(final String value) { + private URL asUrl(String value) { try { return new URL(value); } catch (MalformedURLException e) { @@ -44,4 +49,5 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu } } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 1f6cdc049..df66f538a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -6,7 +6,6 @@ import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; -import eu.dnetlib.pace.model.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; import org.apache.commons.collections.CollectionUtils; import org.codehaus.jackson.annotate.JsonIgnore; @@ -24,11 +23,9 @@ public class PaceConfig implements Serializable { private List clustering; private Map> blacklists; - private Map decisionTree; - private Map modelMap; - public static PaceResolver paceResolver; + // public PaceResolver paceResolver; public PaceConfig() {} @@ -38,7 +35,7 @@ public class PaceConfig implements Serializable { modelMap.put(fd.getName(), fd); } - paceResolver = new PaceResolver(); +// paceResolver = new PaceResolver(); } public List getModel() { @@ -61,14 +58,6 @@ public class PaceConfig implements Serializable { return conditions; } - public Map getDecisionTree() { - return decisionTree; - } - - public void setDecisionTree(Map decisionTree) { - this.decisionTree = decisionTree; - } - @JsonIgnore public List getConditionAlgos() { return asConditionAlgos(getConditions()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java index 0d8dd609c..ff8b34bf3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java @@ -32,6 +32,9 @@ public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim(); cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim(); + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + final String cca = finalCleanup(ca); final String ccb = finalCleanup(cb); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index ece4de896..3ee4be266 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -2,12 +2,17 @@ package eu.dnetlib.pace.model; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import eu.dnetlib.pace.clustering.*; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.util.PaceException; +import eu.dnetlib.pace.util.PaceResolver; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.codehaus.jackson.map.ObjectMapper; public class ClusteringDef implements Serializable { @@ -18,6 +23,8 @@ public class ClusteringDef implements Serializable { private Map params; + PaceResolver paceResolver = new PaceResolver(); + public ClusteringDef() {} public String getName() { @@ -30,7 +37,7 @@ public class ClusteringDef implements Serializable { public ClusteringFunction clusteringFunction() { try { - return PaceConfig.paceResolver.getClusteringFunction(getName(), params); + return paceResolver.getClusteringFunction(getName(), params); } catch (PaceException e) { e.printStackTrace(); return null; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java index ccbca5897..b74a3cfce 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -7,6 +7,7 @@ import java.util.List; import eu.dnetlib.pace.condition.*; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.util.PaceException; +import eu.dnetlib.pace.util.PaceResolver; import org.codehaus.jackson.map.ObjectMapper; public class CondDef implements Serializable { @@ -15,10 +16,12 @@ public class CondDef implements Serializable { private List fields; + PaceResolver paceResolver = new PaceResolver(); + public CondDef() {} public ConditionAlgo conditionAlgo(final List fields){ - return PaceConfig.paceResolver.getConditionAlgo(getName(), fields); + return paceResolver.getConditionAlgo(getName(), fields); } public String getName() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index ef74c0d18..56ca5a425 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -6,6 +6,7 @@ import com.google.gson.Gson; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.distance.DistanceAlgo; +import eu.dnetlib.pace.util.PaceResolver; import java.io.Serializable; import java.util.HashMap; @@ -33,6 +34,8 @@ public class FieldDef implements Serializable { private double weight; + PaceResolver paceResolver = new PaceResolver(); + /** * Sets maximum size for the repeatable fields in the model. -1 for unbounded size. */ @@ -85,7 +88,7 @@ public class FieldDef implements Serializable { params.put("length", getLength()); */ params.put("weight", getWeight()); - return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params); + return paceResolver.getDistanceAlgo(getAlgo(), params); } public boolean isIgnoreMissing() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java deleted file mode 100644 index b1d4917b3..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java +++ /dev/null @@ -1,145 +0,0 @@ -package eu.dnetlib.pace.model; - -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.tree.Comparator; -import eu.dnetlib.pace.tree.support.AggType; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; -import org.codehaus.jackson.map.ObjectMapper; - -import java.io.IOException; -import java.io.Serializable; -import java.util.List; - -public class TreeNodeDef implements Serializable { - - private List fields; //list of fields involved in the tree node (contains comparators to be used and field on which apply the comparator) - private AggType aggregation; //how to aggregate similarity measures for every field - - private double threshold; //threshold on the similarity measure - - private String positive; //specifies the next node in case of positive result: similarity>=th - private String negative; //specifies the next node in case of negative result: similarity fields, double threshold, AggType aggregation, String positive, String negative, String undefined) { - this.fields = fields; - this.threshold = threshold; - this.aggregation = aggregation; - this.positive = positive; - this.negative = negative; - this.undefined = undefined; - } - - public boolean isIgnoreMissing() { - return ignoreMissing; - } - - public void setIgnoreMissing(boolean ignoreMissing) { - this.ignoreMissing = ignoreMissing; - } - - public List getFields() { - return fields; - } - - public void setFields(List fields) { - this.fields = fields; - } - - public double getThreshold() { - return threshold; - } - - public void setThreshold(double threshold) { - this.threshold = threshold; - } - - public AggType getAggregation() { - return aggregation; - } - - public void setAggregation(AggType aggregation) { - this.aggregation = aggregation; - } - - public String getPositive() { - return positive; - } - - public void setPositive(String positive) { - this.positive = positive; - } - - public String getNegative() { - return negative; - } - - public void setNegative(String negative) { - this.negative = negative; - } - - public String getUndefined() { - return undefined; - } - - public void setUndefined(String undefined) { - this.undefined = undefined; - } - - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } -} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java deleted file mode 100644 index 76e41ae9c..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java +++ /dev/null @@ -1,33 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; -import org.apache.commons.lang.StringUtils; - -import java.util.Map; - -abstract class AbstractComparator implements Comparator { - - Map params; - - public AbstractComparator(Map params){ - this.params = params; - } - - @Override - public double compare(Field a, Field b) { - return 0.0; - } - - public static double stringSimilarity(String s1, String s2) { - String longer = s1, shorter = s2; - if (s1.length() < s2.length()) { // longer should always have greater length - longer = s2; shorter = s1; - } - int longerLength = longer.length(); - if (longerLength == 0) //if strings have 0 length return 0 (no similarity) - return 0.0; - - return (longerLength - StringUtils.getLevenshteinDistance(longer, shorter)) / (double) longerLength; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java deleted file mode 100644 index ace3acc21..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java +++ /dev/null @@ -1,42 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; - -import java.util.List; -import java.util.Map; - -@ComparatorClass("coauthorsMatch") -public class CoauthorsMatch extends AbstractComparator { - - public CoauthorsMatch(Map params) { - super(params); - } - - @Override - public double compare(Field a, Field b) { - - final List c1 = ((FieldList) a).stringList(); - final List c2 = ((FieldList) b).stringList(); - - int size1 = c1.size(); - int size2 = c2.size(); - - //few coauthors or too many coauthors - if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue())) - return -1; - - int coauthorship = 0; - for (String ca1: c1){ - - for (String ca2: c2){ - - if (stringSimilarity(ca1.replaceAll("\\.","").replaceAll(" ",""), ca2.replaceAll("\\.","").replaceAll(" ",""))>= params.getOrDefault("simTh", 0.7).doubleValue()) - coauthorship++; - } - } - - return coauthorship; - - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java deleted file mode 100644 index 087028ba2..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java +++ /dev/null @@ -1,10 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; - -public interface Comparator { - - //compare two fields and returns: the distace measure, -1 if undefined - public double compare(Field a, Field b); - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java deleted file mode 100644 index a04fba8ee..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java +++ /dev/null @@ -1,14 +0,0 @@ -package eu.dnetlib.pace.tree; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface ComparatorClass { - - public String value(); -} - diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java deleted file mode 100644 index 8e0e60173..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ /dev/null @@ -1,25 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; - -import java.util.Map; - -@ComparatorClass("exactMatch") -public class ExactMatch extends AbstractComparator { - - public ExactMatch(Map params) { - super(params); - } - - @Override - public double compare(Field a, Field b) { - - if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) - return -1; - else if (a.stringValue().equals(b.stringValue())) - return 1; - else - return 0; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java deleted file mode 100644 index f8f5fe144..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java +++ /dev/null @@ -1,31 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; - -import java.util.Map; - -@ComparatorClass("similar") -public class SimilarMatch extends AbstractComparator { - - public SimilarMatch(Map params) { - super(params); - } - - @Override - public double compare(Field a, Field b) { - - if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) - return -1; //undefined if one name is missing - - //take only the first name - String firstname1 = a.stringValue().split(" ")[0]; - String firstname2 = b.stringValue().split(" ")[0]; - - if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar) - return 1; - - return stringSimilarity(firstname1,firstname2); - - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java deleted file mode 100644 index ea798c7a7..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java +++ /dev/null @@ -1,36 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; - -import java.util.Map; - -@ComparatorClass("topicsMatch") -public class TopicsMatch extends AbstractComparator { - - public TopicsMatch(Map params) { - super(params); - } - - @Override - public double compare(Field a, Field b) { - - double[] t1 = ((FieldListImpl) a).doubleArray(); - double[] t2 = ((FieldListImpl) b).doubleArray(); - - if (t1 == null || t2 == null) - return -1; //0 similarity if no topics in one of the authors or in both - - double area = 0.0; - - double min_value[] = new double[t1.length]; - for(int i=0; i params; - - @Override - public double compare(Field a, Field b) { - - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); - - return 0; - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java deleted file mode 100644 index bd7bd9fb8..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java +++ /dev/null @@ -1,21 +0,0 @@ -package eu.dnetlib.pace.tree.support; - -import eu.dnetlib.pace.util.PaceException; - -public enum AggType { - - AVG, - SUM, - MAX, - MIN; - - public static AggType getEnum(String value) { - - try { - return AggType.valueOf(value); - } - catch (IllegalArgumentException e) { - throw new PaceException("Undefined aggregation type", e); - } - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java deleted file mode 100644 index 158d3f99f..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java +++ /dev/null @@ -1,20 +0,0 @@ -package eu.dnetlib.pace.tree.support; - -public enum MatchType { - - ORCID_MATCH, - COAUTHORS_MATCH, - TOPICS_MATCH, - NO_MATCH, - UNDEFINED; - - public static MatchType getEnum(String value) { - - try { - return MatchType.valueOf(value); - } - catch (IllegalArgumentException e) { - return MatchType.UNDEFINED; - } - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index d8276549a..f8ebb8038 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -7,8 +7,6 @@ import eu.dnetlib.pace.condition.ConditionClass; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.FieldDef; -import eu.dnetlib.pace.tree.Comparator; -import eu.dnetlib.pace.tree.ComparatorClass; import org.reflections.Reflections; import java.io.Serializable; @@ -22,7 +20,6 @@ public class PaceResolver implements Serializable { private final Map> clusteringFunctions; private final Map> conditionAlgos; private final Map> distanceAlgos; - private final Map> comparators; public PaceResolver() { @@ -37,10 +34,6 @@ public class PaceResolver implements Serializable { this.distanceAlgos = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream() .filter(DistanceAlgo.class::isAssignableFrom) .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); - - this.comparators = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ComparatorClass.class).stream() - .filter(Comparator.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class) cl)); } public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { @@ -67,12 +60,4 @@ public class PaceResolver implements Serializable { } } - public Comparator getComparator(String name, Map params) throws PaceException { - try { - return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { - throw new PaceException(name + " not found ", e); - } - } - }