Bug fix in the KeywordsClustering class

This commit is contained in:
miconis 2019-07-08 11:01:49 +02:00
parent f5de20a508
commit 3c6f8d1e44
4 changed files with 25 additions and 6 deletions

View File

@ -19,16 +19,17 @@ public class KeywordsClustering extends AbstractClusteringFunction {
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {
List<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4)); //takes city codes and keywords codes without duplicates
List<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4)); Set<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
Set<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
//list of combinations to return as the result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
int size = 0;
for (String keyword: keywords){ for (String keyword: keywords){
for (String city: cities) { for (String city: cities) {
combinations.add(keyword+"-"+city); combinations.add(keyword+"-"+city);
if (++size>params.getOrDefault("max", 2)) { if (combinations.size()>=params.getOrDefault("max", 2)) {
return combinations; return combinations;
} }
} }

View File

@ -320,7 +320,7 @@ public abstract class AbstractPaceFunctions {
} }
//get the list of codes found in the input string //get the list of codes found in the input string
public List<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){ public Set<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
String s = cleanup(s1); String s = cleanup(s1);
@ -328,7 +328,7 @@ public abstract class AbstractPaceFunctions {
List<String> tokens = Arrays.asList(s.toLowerCase().split(" ")); List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
List<String> codes = new ArrayList<>(); Set<String> codes = new HashSet<>();
if (tokens.size()<windowSize) if (tokens.size()<windowSize)
windowSize = tokens.size(); windowSize = tokens.size();

View File

@ -129,6 +129,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(s1); System.out.println(s1);
System.out.println(cf.apply(Lists.newArrayList(title(s1)))); System.out.println(cf.apply(Lists.newArrayList(title(s1))));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
System.out.println(cf.apply(Lists.newArrayList(title(s2))));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
} }

11
release.properties Normal file
View File

@ -0,0 +1,11 @@
#release configuration
#Mon Jul 08 10:03:15 CEST 2019
scm.tagNameFormat=@{project.artifactId}-@{project.version}
pushChanges=true
scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
preparationGoals=clean verify
projectVersionPolicyId=default
remoteTagging=true
scm.commentPrefix=[maven-release-plugin]
exec.snapshotReleasePluginAllowed=false
completedPhase=create-backup-poms