wordssuffixprefix: adjust the token length according to the number of words; removed maven release temporary files
This commit is contained in:
parent
b79ea97107
commit
51d91fa520
|
@ -22,6 +22,21 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
|||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||
|
||||
final int words = s.split(" ").length;
|
||||
|
||||
// adjust the token length according to the number of words
|
||||
switch (words) {
|
||||
case 1:
|
||||
return Sets.newLinkedHashSet();
|
||||
case 2:
|
||||
return doSuffixPrefix(s, len+2, max, words);
|
||||
case 3:
|
||||
return doSuffixPrefix(s, len+1, max, words);
|
||||
default:
|
||||
return doSuffixPrefix(s, len, max, words);
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
|
||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||
int i = 0;
|
||||
while (++i < s.length() && bigrams.size() < max) {
|
||||
|
@ -39,4 +54,4 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
|||
return bigrams;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -1,333 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-dedup</artifactId>
|
||||
<version>4.0.2-SNAPSHOT</version>
|
||||
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<url>http://www.d-net.research-infrastructures.eu</url>
|
||||
|
||||
<licenses>
|
||||
<license>
|
||||
<name>The Apache Software License, Version 2.0</name>
|
||||
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
|
||||
<distribution>repo</distribution>
|
||||
<comments>A business-friendly OSS license</comments>
|
||||
</license>
|
||||
</licenses>
|
||||
|
||||
<scm>
|
||||
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
|
||||
<tag>HEAD</tag>
|
||||
</scm>
|
||||
|
||||
<modules>
|
||||
<module>dnet-pace-core</module>
|
||||
<module>dnet-dedup-test</module>
|
||||
</modules>
|
||||
|
||||
<issueManagement>
|
||||
<system>Redmine</system>
|
||||
<url>https://issue.openaire.research-infrastructures.eu/projects/openaire</url>
|
||||
</issueManagement>
|
||||
|
||||
|
||||
|
||||
<distributionManagement>
|
||||
<repository>
|
||||
<id>dnet45-releases</id>
|
||||
<name>D-Net 45 Releases</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
|
||||
<layout>default</layout>
|
||||
</repository>
|
||||
</distributionManagement>
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>dnet-deps</id>
|
||||
<name>dnet-dependencies</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
|
||||
<layout>default</layout>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>dnet45-releases</id>
|
||||
<name>D-Net 45 Releases</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
|
||||
<layout>default</layout>
|
||||
<snapshots>
|
||||
<enabled>true</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>dnet45-snapshots</id>
|
||||
<name>D-Net 45 Snapshots</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
|
||||
<layout>default</layout>
|
||||
<snapshots>
|
||||
<enabled>true</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
|
||||
<repository>
|
||||
<id>cloudera</id>
|
||||
<name>Cloudera Repository</name>
|
||||
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
|
||||
<releases>
|
||||
<enabled>true</enabled>
|
||||
</releases>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
</repositories>
|
||||
<build>
|
||||
<directory>target</directory>
|
||||
<outputDirectory>target/classes</outputDirectory>
|
||||
<finalName>${project.artifactId}-${project.version}</finalName>
|
||||
<testOutputDirectory>target/test-classes</testOutputDirectory>
|
||||
|
||||
|
||||
|
||||
|
||||
<!--*************************************************-->
|
||||
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.6.0</version>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
<encoding>${project.build.sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>3.0.2</version>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>3.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>2.19.1</version>
|
||||
<configuration>
|
||||
<redirectTestOutputToFile>true</redirectTestOutputToFile>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.10.4</version>
|
||||
<configuration>
|
||||
<detectLinks>true</detectLinks>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<version>3.0.0</version>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-failsafe-plugin</artifactId>
|
||||
<version>2.13</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>integration-test</id>
|
||||
<goals>
|
||||
<goal>integration-test</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>verify</id>
|
||||
<goals>
|
||||
<goal>verify</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
|
||||
<plugins>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>2.5.3</version>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<dependencyManagement>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
<version>3.2</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
<version>${jackson.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.dataformat</groupId>
|
||||
<artifactId>jackson-dataformat-xml</artifactId>
|
||||
<version>${jackson.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.module</groupId>
|
||||
<artifactId>jackson-module-jsonSchema</artifactId>
|
||||
<version>${jackson.version}</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-math3</artifactId>
|
||||
<version>3.6.1</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>${google.guava.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.gson</groupId>
|
||||
<artifactId>gson</artifactId>
|
||||
<version>${google.gson.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>${commons.lang.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>${commons.io.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
<artifactId>commons-collections</artifactId>
|
||||
<version>${commons.collections.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-logging</groupId>
|
||||
<artifactId>commons-logging</artifactId>
|
||||
<version>${commons.logging.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-graphx_2.11</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.reflections</groupId>
|
||||
<artifactId>reflections</artifactId>
|
||||
<version>0.9.10</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
<version>${scala.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.oozie</groupId>
|
||||
<artifactId>oozie-client</artifactId>
|
||||
<version>5.1.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>2.4.0</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
<properties>
|
||||
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
|
||||
<google.gson.version>2.2.2</google.gson.version>
|
||||
<google.guava.version>15.0</google.guava.version>
|
||||
|
||||
<spark.version>2.2.0</spark.version>
|
||||
<jackson.version>2.6.6</jackson.version>
|
||||
|
||||
<commons.lang.version>3.5</commons.lang.version>
|
||||
<commons.io.version>2.4</commons.io.version>
|
||||
<commons.collections.version>3.2.1</commons.collections.version>
|
||||
<commons.logging.version>1.1.3</commons.logging.version>
|
||||
|
||||
<junit.version>4.9</junit.version>
|
||||
<scala.version>2.11.8</scala.version>
|
||||
|
||||
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
||||
</properties>
|
||||
</project>
|
|
@ -1,22 +0,0 @@
|
|||
#release configuration
|
||||
#Thu Jul 02 17:06:39 CEST 2020
|
||||
scm.commentPrefix=[maven-release-plugin]
|
||||
pushChanges=true
|
||||
project.rel.eu.dnetlib\:dnet-dedup-test=4.0.2
|
||||
scm.tag=dnet-dedup-4.0.2
|
||||
remoteTagging=true
|
||||
project.scm.eu.dnetlib\:dnet-dedup-test.empty=true
|
||||
projectVersionPolicyId=default
|
||||
scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
|
||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
||||
project.rel.eu.dnetlib\:dnet-dedup=4.0.2
|
||||
project.dev.eu.dnetlib\:dnet-pace-core=4.0.3-SNAPSHOT
|
||||
preparationGoals=clean verify
|
||||
project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD
|
||||
project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
|
||||
exec.snapshotReleasePluginAllowed=false
|
||||
project.dev.eu.dnetlib\:dnet-dedup=4.0.3-SNAPSHOT
|
||||
project.scm.eu.dnetlib\:dnet-pace-core.empty=true
|
||||
project.dev.eu.dnetlib\:dnet-dedup-test=4.0.3-SNAPSHOT
|
||||
completedPhase=end-release
|
||||
project.rel.eu.dnetlib\:dnet-pace-core=4.0.2
|
Loading…
Reference in New Issue