wordssuffixprefix: adjust the token length according to the number of words; removed maven release temporary files
This commit is contained in:
parent
b7a27ace62
commit
b46be9c8ae
|
@@ -22,6 +22,21 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
||||||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||||
|
|
||||||
final int words = s.split(" ").length;
|
final int words = s.split(" ").length;
|
||||||
|
|
||||||
|
// adjust the token length according to the number of words
|
||||||
|
switch (words) {
|
||||||
|
case 1:
|
||||||
|
return Sets.newLinkedHashSet();
|
||||||
|
case 2:
|
||||||
|
return doSuffixPrefix(s, len+2, max, words);
|
||||||
|
case 3:
|
||||||
|
return doSuffixPrefix(s, len+1, max, words);
|
||||||
|
default:
|
||||||
|
return doSuffixPrefix(s, len, max, words);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
|
||||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (++i < s.length() && bigrams.size() < max) {
|
while (++i < s.length() && bigrams.size() < max) {
|
||||||
|
|
|
@@ -1,333 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
|
||||||
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<groupId>eu.dnetlib</groupId>
|
|
||||||
<artifactId>dnet-dedup</artifactId>
|
|
||||||
<version>4.0.2-SNAPSHOT</version>
|
|
||||||
|
|
||||||
<packaging>pom</packaging>
|
|
||||||
|
|
||||||
<url>http://www.d-net.research-infrastructures.eu</url>
|
|
||||||
|
|
||||||
<licenses>
|
|
||||||
<license>
|
|
||||||
<name>The Apache Software License, Version 2.0</name>
|
|
||||||
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
|
|
||||||
<distribution>repo</distribution>
|
|
||||||
<comments>A business-friendly OSS license</comments>
|
|
||||||
</license>
|
|
||||||
</licenses>
|
|
||||||
|
|
||||||
<scm>
|
|
||||||
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
|
|
||||||
<tag>HEAD</tag>
|
|
||||||
</scm>
|
|
||||||
|
|
||||||
<modules>
|
|
||||||
<module>dnet-pace-core</module>
|
|
||||||
<module>dnet-dedup-test</module>
|
|
||||||
</modules>
|
|
||||||
|
|
||||||
<issueManagement>
|
|
||||||
<system>Redmine</system>
|
|
||||||
<url>https://issue.openaire.research-infrastructures.eu/projects/openaire</url>
|
|
||||||
</issueManagement>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<distributionManagement>
|
|
||||||
<repository>
|
|
||||||
<id>dnet45-releases</id>
|
|
||||||
<name>D-Net 45 Releases</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
</repository>
|
|
||||||
</distributionManagement>
|
|
||||||
|
|
||||||
<repositories>
|
|
||||||
<repository>
|
|
||||||
<id>dnet-deps</id>
|
|
||||||
<name>dnet-dependencies</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
</repository>
|
|
||||||
<repository>
|
|
||||||
<id>dnet45-releases</id>
|
|
||||||
<name>D-Net 45 Releases</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>true</enabled>
|
|
||||||
</snapshots>
|
|
||||||
</repository>
|
|
||||||
<repository>
|
|
||||||
<id>dnet45-snapshots</id>
|
|
||||||
<name>D-Net 45 Snapshots</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>true</enabled>
|
|
||||||
</snapshots>
|
|
||||||
</repository>
|
|
||||||
|
|
||||||
<repository>
|
|
||||||
<id>cloudera</id>
|
|
||||||
<name>Cloudera Repository</name>
|
|
||||||
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
|
|
||||||
<releases>
|
|
||||||
<enabled>true</enabled>
|
|
||||||
</releases>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>false</enabled>
|
|
||||||
</snapshots>
|
|
||||||
</repository>
|
|
||||||
</repositories>
|
|
||||||
<build>
|
|
||||||
<directory>target</directory>
|
|
||||||
<outputDirectory>target/classes</outputDirectory>
|
|
||||||
<finalName>${project.artifactId}-${project.version}</finalName>
|
|
||||||
<testOutputDirectory>target/test-classes</testOutputDirectory>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<!--*************************************************-->
|
|
||||||
|
|
||||||
<pluginManagement>
|
|
||||||
<plugins>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
|
||||||
<version>3.6.0</version>
|
|
||||||
<configuration>
|
|
||||||
<source>1.8</source>
|
|
||||||
<target>1.8</target>
|
|
||||||
<encoding>${project.build.sourceEncoding}</encoding>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-jar-plugin</artifactId>
|
|
||||||
<version>3.0.2</version>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-source-plugin</artifactId>
|
|
||||||
<version>3.0.1</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<id>attach-sources</id>
|
|
||||||
<phase>verify</phase>
|
|
||||||
<goals>
|
|
||||||
<goal>jar-no-fork</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-surefire-plugin</artifactId>
|
|
||||||
<version>2.19.1</version>
|
|
||||||
<configuration>
|
|
||||||
<redirectTestOutputToFile>true</redirectTestOutputToFile>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-javadoc-plugin</artifactId>
|
|
||||||
<version>2.10.4</version>
|
|
||||||
<configuration>
|
|
||||||
<detectLinks>true</detectLinks>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-dependency-plugin</artifactId>
|
|
||||||
<version>3.0.0</version>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-failsafe-plugin</artifactId>
|
|
||||||
<version>2.13</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<id>integration-test</id>
|
|
||||||
<goals>
|
|
||||||
<goal>integration-test</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
<execution>
|
|
||||||
<id>verify</id>
|
|
||||||
<goals>
|
|
||||||
<goal>verify</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
</plugins>
|
|
||||||
</pluginManagement>
|
|
||||||
|
|
||||||
<plugins>
|
|
||||||
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-release-plugin</artifactId>
|
|
||||||
<version>2.5.3</version>
|
|
||||||
</plugin>
|
|
||||||
|
|
||||||
</plugins>
|
|
||||||
</build>
|
|
||||||
|
|
||||||
<dependencyManagement>
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>edu.cmu</groupId>
|
|
||||||
<artifactId>secondstring</artifactId>
|
|
||||||
<version>1.0.0</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.antlr</groupId>
|
|
||||||
<artifactId>stringtemplate</artifactId>
|
|
||||||
<version>3.2</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
|
||||||
<artifactId>jackson-databind</artifactId>
|
|
||||||
<version>${jackson.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.fasterxml.jackson.dataformat</groupId>
|
|
||||||
<artifactId>jackson-dataformat-xml</artifactId>
|
|
||||||
<version>${jackson.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.fasterxml.jackson.module</groupId>
|
|
||||||
<artifactId>jackson-module-jsonSchema</artifactId>
|
|
||||||
<version>${jackson.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.commons</groupId>
|
|
||||||
<artifactId>commons-math3</artifactId>
|
|
||||||
<version>3.6.1</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.guava</groupId>
|
|
||||||
<artifactId>guava</artifactId>
|
|
||||||
<version>${google.guava.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.code.gson</groupId>
|
|
||||||
<artifactId>gson</artifactId>
|
|
||||||
<version>${google.gson.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.commons</groupId>
|
|
||||||
<artifactId>commons-lang3</artifactId>
|
|
||||||
<version>${commons.lang.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-io</groupId>
|
|
||||||
<artifactId>commons-io</artifactId>
|
|
||||||
<version>${commons.io.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-collections</groupId>
|
|
||||||
<artifactId>commons-collections</artifactId>
|
|
||||||
<version>${commons.collections.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-logging</groupId>
|
|
||||||
<artifactId>commons-logging</artifactId>
|
|
||||||
<version>${commons.logging.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.spark</groupId>
|
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
|
||||||
<version>${spark.version}</version>
|
|
||||||
<scope>provided</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.spark</groupId>
|
|
||||||
<artifactId>spark-graphx_2.11</artifactId>
|
|
||||||
<version>${spark.version}</version>
|
|
||||||
<scope>provided</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.spark</groupId>
|
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
|
||||||
<version>${spark.version}</version>
|
|
||||||
<scope>provided</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
<version>${junit.version}</version>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.reflections</groupId>
|
|
||||||
<artifactId>reflections</artifactId>
|
|
||||||
<version>0.9.10</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.scala-lang</groupId>
|
|
||||||
<artifactId>scala-library</artifactId>
|
|
||||||
<version>${scala.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.oozie</groupId>
|
|
||||||
<artifactId>oozie-client</artifactId>
|
|
||||||
<version>5.1.0</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
|
||||||
<artifactId>json-path</artifactId>
|
|
||||||
<version>2.4.0</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
|
|
||||||
</dependencies>
|
|
||||||
</dependencyManagement>
|
|
||||||
|
|
||||||
<properties>
|
|
||||||
|
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
|
||||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
|
||||||
|
|
||||||
<google.gson.version>2.2.2</google.gson.version>
|
|
||||||
<google.guava.version>15.0</google.guava.version>
|
|
||||||
|
|
||||||
<spark.version>2.2.0</spark.version>
|
|
||||||
<jackson.version>2.6.6</jackson.version>
|
|
||||||
|
|
||||||
<commons.lang.version>3.5</commons.lang.version>
|
|
||||||
<commons.io.version>2.4</commons.io.version>
|
|
||||||
<commons.collections.version>3.2.1</commons.collections.version>
|
|
||||||
<commons.logging.version>1.1.3</commons.logging.version>
|
|
||||||
|
|
||||||
<junit.version>4.9</junit.version>
|
|
||||||
<scala.version>2.11.8</scala.version>
|
|
||||||
|
|
||||||
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
|
||||||
</properties>
|
|
||||||
</project>
|
|
|
@@ -1,22 +0,0 @@
|
||||||
#release configuration
|
|
||||||
#Thu Jul 02 17:06:39 CEST 2020
|
|
||||||
scm.commentPrefix=[maven-release-plugin]
|
|
||||||
pushChanges=true
|
|
||||||
project.rel.eu.dnetlib\:dnet-dedup-test=4.0.2
|
|
||||||
scm.tag=dnet-dedup-4.0.2
|
|
||||||
remoteTagging=true
|
|
||||||
project.scm.eu.dnetlib\:dnet-dedup-test.empty=true
|
|
||||||
projectVersionPolicyId=default
|
|
||||||
scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
|
|
||||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
|
||||||
project.rel.eu.dnetlib\:dnet-dedup=4.0.2
|
|
||||||
project.dev.eu.dnetlib\:dnet-pace-core=4.0.3-SNAPSHOT
|
|
||||||
preparationGoals=clean verify
|
|
||||||
project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD
|
|
||||||
project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
|
|
||||||
exec.snapshotReleasePluginAllowed=false
|
|
||||||
project.dev.eu.dnetlib\:dnet-dedup=4.0.3-SNAPSHOT
|
|
||||||
project.scm.eu.dnetlib\:dnet-pace-core.empty=true
|
|
||||||
project.dev.eu.dnetlib\:dnet-dedup-test=4.0.3-SNAPSHOT
|
|
||||||
completedPhase=end-release
|
|
||||||
project.rel.eu.dnetlib\:dnet-pace-core=4.0.2
|
|
Loading…
Reference in New Issue