diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml
index f474e5693..3bc7480ee 100644
--- a/dnet-pace-core/pom.xml
+++ b/dnet-pace-core/pom.xml
@@ -6,7 +6,7 @@
eu.dnetlib
dnet-dedup
- 4.0.3
+ 4.0.3-SNAPSHOT
../pom.xml
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
index 6086ac0a8..1e94b34d2 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
@@ -22,6 +22,21 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction {
private Collection suffixPrefix(String s, int len, int max) {
final int words = s.split(" ").length;
+
+ // adjust the token length according to the number of words
+ switch (words) {
+ case 1:
+ return Sets.newLinkedHashSet();
+ case 2:
+ return doSuffixPrefix(s, len+2, max, words);
+ case 3:
+ return doSuffixPrefix(s, len+1, max, words);
+ default:
+ return doSuffixPrefix(s, len, max, words);
+ }
+ }
+
+ private Collection doSuffixPrefix(String s, int len, int max, int words) {
final Set bigrams = Sets.newLinkedHashSet();
int i = 0;
while (++i < s.length() && bigrams.size() < max) {
@@ -39,4 +54,4 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction {
return bigrams;
}
-}
+}
\ No newline at end of file
diff --git a/pom.xml.releaseBackup b/pom.xml.releaseBackup
deleted file mode 100644
index d840af173..000000000
--- a/pom.xml.releaseBackup
+++ /dev/null
@@ -1,333 +0,0 @@
-
-
-
- 4.0.0
-
- eu.dnetlib
- dnet-dedup
- 4.0.2-SNAPSHOT
-
- pom
-
- http://www.d-net.research-infrastructures.eu
-
-
-
- The Apache Software License, Version 2.0
- http://www.apache.org/licenses/LICENSE-2.0.txt
- repo
- A business-friendly OSS license
-
-
-
-
- scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git
- HEAD
-
-
-
- dnet-pace-core
- dnet-dedup-test
-
-
-
- Redmine
- https://issue.openaire.research-infrastructures.eu/projects/openaire
-
-
-
-
-
-
- dnet45-releases
- D-Net 45 Releases
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases
- default
-
-
-
-
-
- dnet-deps
- dnet-dependencies
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps
- default
-
-
- dnet45-releases
- D-Net 45 Releases
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases
- default
-
- true
-
-
-
- dnet45-snapshots
- D-Net 45 Snapshots
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots
- default
-
- true
-
-
-
-
- cloudera
- Cloudera Repository
- https://repository.cloudera.com/artifactory/cloudera-repos
-
- true
-
-
- false
-
-
-
-
- target
- target/classes
- ${project.artifactId}-${project.version}
- target/test-classes
-
-
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.6.0
-
-
- 1.8
- ${project.build.sourceEncoding}
-
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 3.0.2
-
-
-
- org.apache.maven.plugins
- maven-source-plugin
- 3.0.1
-
-
- attach-sources
- verify
-
- jar-no-fork
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 2.19.1
-
- true
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.10.4
-
- true
-
-
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
- 3.0.0
-
-
-
- org.apache.maven.plugins
- maven-failsafe-plugin
- 2.13
-
-
- integration-test
-
- integration-test
-
-
-
- verify
-
- verify
-
-
-
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-release-plugin
- 2.5.3
-
-
-
-
-
-
-
-
- edu.cmu
- secondstring
- 1.0.0
-
-
- org.antlr
- stringtemplate
- 3.2
-
-
-
- com.fasterxml.jackson.core
- jackson-databind
- ${jackson.version}
-
-
-
- com.fasterxml.jackson.dataformat
- jackson-dataformat-xml
- ${jackson.version}
-
-
- com.fasterxml.jackson.module
- jackson-module-jsonSchema
- ${jackson.version}
-
-
-
-
-
- org.apache.commons
- commons-math3
- 3.6.1
-
-
-
- com.google.guava
- guava
- ${google.guava.version}
-
-
- com.google.code.gson
- gson
- ${google.gson.version}
-
-
-
- org.apache.commons
- commons-lang3
- ${commons.lang.version}
-
-
-
- commons-io
- commons-io
- ${commons.io.version}
-
-
- commons-collections
- commons-collections
- ${commons.collections.version}
-
-
- commons-logging
- commons-logging
- ${commons.logging.version}
-
-
- org.apache.spark
- spark-core_2.11
- ${spark.version}
- provided
-
-
- org.apache.spark
- spark-graphx_2.11
- ${spark.version}
- provided
-
-
- org.apache.spark
- spark-sql_2.11
- ${spark.version}
- provided
-
-
- junit
- junit
- ${junit.version}
- test
-
-
- org.reflections
- reflections
- 0.9.10
-
-
-
- org.scala-lang
- scala-library
- ${scala.version}
-
-
-
- org.apache.oozie
- oozie-client
- 5.1.0
-
-
- com.jayway.jsonpath
- json-path
- 2.4.0
-
-
-
-
-
-
-
-
- UTF-8
- UTF-8
-
- 2.2.2
- 15.0
-
- 2.2.0
- 2.6.6
-
- 3.5
- 2.4
- 3.2.1
- 1.1.3
-
- 4.9
- 2.11.8
-
- false
-
-