forked from D-Net/dnet-hadoop
Bug fix in the KeywordsClustering class
This commit is contained in:
parent
f5de20a508
commit
3c6f8d1e44
|
@ -19,16 +19,17 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
|
|
||||||
List<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
|
//collect the city codes and the keyword codes, without duplicates
|
||||||
List<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
|
Set<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
|
||||||
|
Set<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
|
||||||
|
|
||||||
|
//combinations to return as the result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
int size = 0;
|
|
||||||
for (String keyword: keywords){
|
for (String keyword: keywords){
|
||||||
for (String city: cities) {
|
for (String city: cities) {
|
||||||
combinations.add(keyword+"-"+city);
|
combinations.add(keyword+"-"+city);
|
||||||
if (++size>params.getOrDefault("max", 2)) {
|
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||||
return combinations;
|
return combinations;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -320,7 +320,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
//get the codes found in the input string
|
//get the codes found in the input string
|
||||||
public List<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
|
public Set<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
|
||||||
|
|
||||||
String s = cleanup(s1);
|
String s = cleanup(s1);
|
||||||
|
|
||||||
|
@ -328,7 +328,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||||
|
|
||||||
List<String> codes = new ArrayList<>();
|
Set<String> codes = new HashSet<>();
|
||||||
|
|
||||||
if (tokens.size()<windowSize)
|
if (tokens.size()<windowSize)
|
||||||
windowSize = tokens.size();
|
windowSize = tokens.size();
|
||||||
|
|
|
@ -129,6 +129,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
System.out.println(s1);
|
System.out.println(s1);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
||||||
|
|
||||||
|
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||||
|
System.out.println("s2 = " + s2);
|
||||||
|
System.out.println(cf.apply(Lists.newArrayList(title(s2))));
|
||||||
|
|
||||||
|
final String s3 = "universita universita milano milano";
|
||||||
|
System.out.println("s3 = " + s3);
|
||||||
|
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
#release configuration
|
||||||
|
#Mon Jul 08 10:03:15 CEST 2019
|
||||||
|
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
||||||
|
pushChanges=true
|
||||||
|
scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
|
||||||
|
preparationGoals=clean verify
|
||||||
|
projectVersionPolicyId=default
|
||||||
|
remoteTagging=true
|
||||||
|
scm.commentPrefix=[maven-release-plugin]
|
||||||
|
exec.snapshotReleasePluginAllowed=false
|
||||||
|
completedPhase=create-backup-poms
|
Loading…
Reference in New Issue