wordssuffixprefix: adjust the token length according to the number of words; removed maven release temporary files

This commit is contained in:
Claudio Atzori 2020-07-15 16:49:47 +02:00
parent b7a27ace62
commit b46be9c8ae
3 changed files with 15 additions and 355 deletions

View File

@ -22,6 +22,21 @@ public class WordsSuffixPrefix extends AbstractClusteringFunction {
private Collection<String> suffixPrefix(String s, int len, int max) {
final int words = s.split(" ").length;
// adjust the token length according to the number of words
switch (words) {
case 1:
return Sets.newLinkedHashSet();
case 2:
return doSuffixPrefix(s, len+2, max, words);
case 3:
return doSuffixPrefix(s, len+1, max, words);
default:
return doSuffixPrefix(s, len, max, words);
}
}
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
final Set<String> bigrams = Sets.newLinkedHashSet();
int i = 0;
while (++i < s.length() && bigrams.size() < max) {

View File

@ -1,333 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.0.2-SNAPSHOT</version>
<packaging>pom</packaging>
<url>http://www.d-net.research-infrastructures.eu</url>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
<comments>A business-friendly OSS license</comments>
</license>
</licenses>
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
<tag>HEAD</tag>
</scm>
<modules>
<module>dnet-pace-core</module>
<module>dnet-dedup-test</module>
</modules>
<issueManagement>
<system>Redmine</system>
<url>https://issue.openaire.research-infrastructures.eu/projects/openaire</url>
</issueManagement>
<distributionManagement>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
</repository>
</distributionManagement>
<repositories>
<repository>
<id>dnet-deps</id>
<name>dnet-dependencies</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
<layout>default</layout>
</repository>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>dnet45-snapshots</id>
<name>D-Net 45 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>cloudera</id>
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<build>
<directory>target</directory>
<outputDirectory>target/classes</outputDirectory>
<finalName>${project.artifactId}-${project.version}</finalName>
<testOutputDirectory>target/test-classes</testOutputDirectory>
<!--*************************************************-->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>attach-sources</id>
<phase>verify</phase>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
<configuration>
<detectLinks>true</detectLinks>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.13</version>
<executions>
<execution>
<id>integration-test</id>
<goals>
<goal>integration-test</goal>
</goals>
</execution>
<execution>
<id>verify</id>
<goals>
<goal>verify</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.5.3</version>
</plugin>
</plugins>
</build>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
<version>3.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-jsonSchema</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${google.guava.version}</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${google.gson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>${commons.collections.version}</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons.logging.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.10</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
<artifactId>oozie-client</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.4.0</version>
</dependency>
</dependencies>
</dependencyManagement>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<google.gson.version>2.2.2</google.gson.version>
<google.guava.version>15.0</google.guava.version>
<spark.version>2.2.0</spark.version>
<jackson.version>2.6.6</jackson.version>
<commons.lang.version>3.5</commons.lang.version>
<commons.io.version>2.4</commons.io.version>
<commons.collections.version>3.2.1</commons.collections.version>
<commons.logging.version>1.1.3</commons.logging.version>
<junit.version>4.9</junit.version>
<scala.version>2.11.8</scala.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
</properties>
</project>

View File

@ -1,22 +0,0 @@
#release configuration
#Thu Jul 02 17:06:39 CEST 2020
scm.commentPrefix=[maven-release-plugin]
pushChanges=true
project.rel.eu.dnetlib\:dnet-dedup-test=4.0.2
scm.tag=dnet-dedup-4.0.2
remoteTagging=true
project.scm.eu.dnetlib\:dnet-dedup-test.empty=true
projectVersionPolicyId=default
scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
scm.tagNameFormat=@{project.artifactId}-@{project.version}
project.rel.eu.dnetlib\:dnet-dedup=4.0.2
project.dev.eu.dnetlib\:dnet-pace-core=4.0.3-SNAPSHOT
preparationGoals=clean verify
project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD
project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
exec.snapshotReleasePluginAllowed=false
project.dev.eu.dnetlib\:dnet-dedup=4.0.3-SNAPSHOT
project.scm.eu.dnetlib\:dnet-pace-core.empty=true
project.dev.eu.dnetlib\:dnet-dedup-test=4.0.3-SNAPSHOT
completedPhase=end-release
project.rel.eu.dnetlib\:dnet-pace-core=4.0.2