diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java index 6086ac0..80459cb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java @@ -22,6 +22,21 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction { private Collection suffixPrefix(String s, int len, int max) { final int words = s.split(" ").length; + + // adjust the token length according to the number of words + switch (words) { + case 1: + return Sets.newLinkedHashSet(); + case 2: + return doSuffixPrefix(s, len+2, max, words); + case 3: + return doSuffixPrefix(s, len+1, max, words); + default: + return doSuffixPrefix(s, len, max, words); + } + } + + private Collection doSuffixPrefix(String s, int len, int max, int words) { final Set bigrams = Sets.newLinkedHashSet(); int i = 0; while (++i < s.length() && bigrams.size() < max) { diff --git a/pom.xml.releaseBackup b/pom.xml.releaseBackup deleted file mode 100644 index d840af1..0000000 --- a/pom.xml.releaseBackup +++ /dev/null @@ -1,333 +0,0 @@ - - - - 4.0.0 - - eu.dnetlib - dnet-dedup - 4.0.2-SNAPSHOT - - pom - - http://www.d-net.research-infrastructures.eu - - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - - - - scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git - HEAD - - - - dnet-pace-core - dnet-dedup-test - - - - Redmine - https://issue.openaire.research-infrastructures.eu/projects/openaire - - - - - - - dnet45-releases - D-Net 45 Releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases - default - - - - - - dnet-deps - dnet-dependencies - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps - default - - - dnet45-releases - D-Net 45 Releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases - default - - true - - - - dnet45-snapshots - D-Net 45 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots - default - - true - - - - - cloudera - Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos - - true - - - false - - - - - target - target/classes - ${project.artifactId}-${project.version} - target/test-classes - - - - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.0 - - 1.8 - 1.8 - ${project.build.sourceEncoding} - - - - - org.apache.maven.plugins - maven-jar-plugin - 3.0.2 - - - - org.apache.maven.plugins - maven-source-plugin - 3.0.1 - - - attach-sources - verify - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.19.1 - - true - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.4 - - true - - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.0.0 - - - - org.apache.maven.plugins - maven-failsafe-plugin - 2.13 - - - integration-test - - integration-test - - - - verify - - verify - - - - - - - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - - - - - - - - edu.cmu - secondstring - 1.0.0 - - - org.antlr - stringtemplate - 3.2 - - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-xml - ${jackson.version} - - - com.fasterxml.jackson.module - jackson-module-jsonSchema - ${jackson.version} - - - - - - org.apache.commons - commons-math3 - 3.6.1 - - - - com.google.guava - guava - ${google.guava.version} - - - com.google.code.gson - gson - ${google.gson.version} - - - - org.apache.commons - commons-lang3 - ${commons.lang.version} - - - - commons-io - commons-io - ${commons.io.version} - - - commons-collections - commons-collections - ${commons.collections.version} - - - commons-logging - commons-logging - ${commons.logging.version} - - - org.apache.spark - spark-core_2.11 - ${spark.version} - provided - - - org.apache.spark - spark-graphx_2.11 - ${spark.version} - provided - - - org.apache.spark - spark-sql_2.11 - ${spark.version} - provided - - - junit - junit - ${junit.version} - test - - - org.reflections - reflections - 0.9.10 - - - - org.scala-lang - scala-library - ${scala.version} - - - - org.apache.oozie - oozie-client - 5.1.0 - - - com.jayway.jsonpath - json-path - 2.4.0 - - - - - - - - - UTF-8 - UTF-8 - - 2.2.2 - 15.0 - - 2.2.0 - 2.6.6 - - 3.5 - 2.4 - 3.2.1 - 1.1.3 - - 4.9 - 2.11.8 - - false - - diff --git a/release.properties b/release.properties deleted file mode 100644 index 70a3c40..0000000 --- a/release.properties +++ /dev/null @@ -1,22 +0,0 @@ -#release configuration -#Thu Jul 02 17:06:39 CEST 2020 -scm.commentPrefix=[maven-release-plugin] -pushChanges=true -project.rel.eu.dnetlib\:dnet-dedup-test=4.0.2 -scm.tag=dnet-dedup-4.0.2 -remoteTagging=true -project.scm.eu.dnetlib\:dnet-dedup-test.empty=true -projectVersionPolicyId=default -scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git -scm.tagNameFormat=@{project.artifactId}-@{project.version} -project.rel.eu.dnetlib\:dnet-dedup=4.0.2 -project.dev.eu.dnetlib\:dnet-pace-core=4.0.3-SNAPSHOT -preparationGoals=clean verify -project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD -project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git -exec.snapshotReleasePluginAllowed=false -project.dev.eu.dnetlib\:dnet-dedup=4.0.3-SNAPSHOT -project.scm.eu.dnetlib\:dnet-pace-core.empty=true -project.dev.eu.dnetlib\:dnet-dedup-test=4.0.3-SNAPSHOT -completedPhase=end-release -project.rel.eu.dnetlib\:dnet-pace-core=4.0.2