forked from D-Net/dnet-hadoop
Bug fix in the KeywordsClustering class
This commit is contained in:
parent
f5de20a508
commit
3c6f8d1e44
|
@ -19,16 +19,17 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
|
||||
List<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
|
||||
List<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
|
||||
//take the city codes and the keyword codes, without duplicates
|
||||
Set<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
|
||||
|
||||
//collection of combinations to return as the result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
|
||||
int size = 0;
|
||||
for (String keyword: keywords){
|
||||
for (String city: cities) {
|
||||
combinations.add(keyword+"-"+city);
|
||||
if (++size>params.getOrDefault("max", 2)) {
|
||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||
return combinations;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -320,7 +320,7 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
//get the codes found in the input string
|
||||
public List<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
|
||||
public Set<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
|
||||
|
||||
String s = cleanup(s1);
|
||||
|
||||
|
@ -328,7 +328,7 @@ public abstract class AbstractPaceFunctions {
|
|||
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
|
||||
List<String> codes = new ArrayList<>();
|
||||
Set<String> codes = new HashSet<>();
|
||||
|
||||
if (tokens.size()<windowSize)
|
||||
windowSize = tokens.size();
|
||||
|
|
|
@ -129,6 +129,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println(s1);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
||||
|
||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||
System.out.println("s2 = " + s2);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s2))));
|
||||
|
||||
final String s3 = "universita universita milano milano";
|
||||
System.out.println("s3 = " + s3);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
#release configuration
|
||||
#Mon Jul 08 10:03:15 CEST 2019
|
||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
||||
pushChanges=true
|
||||
scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
|
||||
preparationGoals=clean verify
|
||||
projectVersionPolicyId=default
|
||||
remoteTagging=true
|
||||
scm.commentPrefix=[maven-release-plugin]
|
||||
exec.snapshotReleasePluginAllowed=false
|
||||
completedPhase=create-backup-poms
|
Loading…
Reference in New Issue