forked from D-Net/dnet-hadoop
addition of cities check
This commit is contained in:
parent
72a9b3139e
commit
3d4372ced9
|
@ -69,6 +69,11 @@
|
|||
<artifactId>jackson-mapper-asl</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-math3</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ import eu.dnetlib.pace.model.FieldListImpl;
|
|||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
|
@ -171,7 +172,7 @@ public abstract class AbstractPaceFunctions {
|
|||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (String key: line){
|
||||
m.put(fixAliases(key),value);
|
||||
m.put(fixAliases(key).toLowerCase(),value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e){
|
||||
|
@ -191,17 +192,21 @@ public abstract class AbstractPaceFunctions {
|
|||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
//TODO remove also codes of the cities
|
||||
public String normalizeCities(String s1, Map<String,String> cityMap){
|
||||
for (String city : cityMap.keySet())
|
||||
s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " ");
|
||||
return s1;
|
||||
}
|
||||
|
||||
public String removeCodes(String s) {
|
||||
final String regex = "\\bkey::[0-9]*\\b";
|
||||
return s.replaceAll(regex, "").trim();
|
||||
final String regexKey = "\\bkey::[0-9]*\\b";
|
||||
final String regexCity = "\\bcity::[0-9]*\\b";
|
||||
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
|
||||
|
||||
}
|
||||
|
||||
//check if 2 strings have same keywords
|
||||
public boolean sameKeywords(String s1, String s2){
|
||||
//all keywords in common
|
||||
//return getKeywords(s1).containsAll(getKeywords(s2)) && getKeywords(s2).containsAll(getKeywords(s1));
|
||||
|
||||
//at least 1 keyword in common
|
||||
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
|
||||
return true;
|
||||
|
@ -209,11 +214,36 @@ public abstract class AbstractPaceFunctions {
|
|||
return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0;
|
||||
}
|
||||
|
||||
//returns true if at least 1 city is in common
|
||||
//returns true if a name has no cities
|
||||
public boolean sameCity(String s1, String s2){
|
||||
|
||||
if (getCities(s1).isEmpty() || getCities(s2).isEmpty())
|
||||
return true;
|
||||
else
|
||||
return CollectionUtils.intersection(getCities(s1), getCities(s2)).size()>0;
|
||||
}
|
||||
|
||||
//get the list of city codes (city::N) in a string
|
||||
public List<String> getCities(String s) {
|
||||
|
||||
final String regex = "\\bcity::[0-9]*\\b";
|
||||
|
||||
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
||||
Matcher m = p.matcher(s);
|
||||
List<String> codes = new ArrayList<>();
|
||||
while (m.find()) {
|
||||
codes.add(m.group(0));
|
||||
for (int i = 1; i <= m.groupCount(); i++) {
|
||||
codes.add(m.group(0));
|
||||
}
|
||||
}
|
||||
return codes;
|
||||
}
|
||||
|
||||
//get the list of keywords in a string
|
||||
public List<String> getKeywords(String s) {
|
||||
|
||||
// final String regex = " \\d+ ";
|
||||
|
||||
final String regex = "\\bkey::[0-9]*\\b";
|
||||
|
||||
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
||||
|
|
|
@ -5,7 +5,6 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
|||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -22,6 +21,8 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
|||
//key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier
|
||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
@ -46,10 +47,22 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
|||
ca = translate(ca, translationMap);
|
||||
cb = translate(cb, translationMap);
|
||||
|
||||
if (sameKeywords(ca,cb)) {
|
||||
return normalize(ssalgo.score(removeCodes(ca), removeCodes(cb)));
|
||||
String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
|
||||
ca = norm.split("\\|\\|\\|")[0].trim();
|
||||
cb = norm.split("\\|\\|\\|")[1].trim();
|
||||
|
||||
if (sameCity(ca,cb)){
|
||||
if (sameKeywords(ca,cb)){
|
||||
ca = removeCodes(ca);
|
||||
cb = removeCodes(cb);
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca,cb));
|
||||
}
|
||||
}
|
||||
return 0.0;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -41,7 +41,7 @@ public class BlockProcessor {
|
|||
final Queue<MapDocument> q = prepare(documents);
|
||||
|
||||
if (q.size() > 1) {
|
||||
log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
//process(q, context);
|
||||
process(simplifyQueue(q, key, context), context);
|
||||
} else {
|
||||
|
@ -109,7 +109,7 @@ public class BlockProcessor {
|
|||
q.addAll(tempResults);
|
||||
} else {
|
||||
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
|
||||
log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
||||
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -150,7 +150,7 @@ public class BlockProcessor {
|
|||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
emitOutput(sr, idPivot, idCurr, context);
|
||||
i++;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,16 +1,16 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
|
||||
public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
|
@ -47,9 +47,18 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
@Test
|
||||
public void testJaroWinklerNormalizedName() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("universita degli studi di genova", "universita di genova");
|
||||
double result = jaroWinklerNormalizedName.distance("Universita di Pisa", "Universita di Parma");
|
||||
|
||||
System.out.println(result);
|
||||
assertEquals(result, 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName2() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa");
|
||||
|
||||
assertEquals(result, 1.0);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue