forked from D-Net/dnet-hadoop
addition of cities check
This commit is contained in:
parent
72a9b3139e
commit
3d4372ced9
|
@ -69,6 +69,11 @@
|
||||||
<artifactId>jackson-mapper-asl</artifactId>
|
<artifactId>jackson-mapper-asl</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-math3</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@ import eu.dnetlib.pace.model.FieldListImpl;
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||||
|
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -171,7 +172,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
String[] line = s.split(";");
|
String[] line = s.split(";");
|
||||||
String value = line[0];
|
String value = line[0];
|
||||||
for (String key: line){
|
for (String key: line){
|
||||||
m.put(fixAliases(key),value);
|
m.put(fixAliases(key).toLowerCase(),value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (final Throwable e){
|
} catch (final Throwable e){
|
||||||
|
@ -191,17 +192,21 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().trim();
|
return sb.toString().trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO remove also codes of the cities
|
public String normalizeCities(String s1, Map<String,String> cityMap){
|
||||||
|
for (String city : cityMap.keySet())
|
||||||
|
s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " ");
|
||||||
|
return s1;
|
||||||
|
}
|
||||||
|
|
||||||
public String removeCodes(String s) {
|
public String removeCodes(String s) {
|
||||||
final String regex = "\\bkey::[0-9]*\\b";
|
final String regexKey = "\\bkey::[0-9]*\\b";
|
||||||
return s.replaceAll(regex, "").trim();
|
final String regexCity = "\\bcity::[0-9]*\\b";
|
||||||
|
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//check if 2 strings have same keywords
|
//check if 2 strings have same keywords
|
||||||
public boolean sameKeywords(String s1, String s2){
|
public boolean sameKeywords(String s1, String s2){
|
||||||
//all keywords in common
|
|
||||||
//return getKeywords(s1).containsAll(getKeywords(s2)) && getKeywords(s2).containsAll(getKeywords(s1));
|
|
||||||
|
|
||||||
//at least 1 keyword in common
|
//at least 1 keyword in common
|
||||||
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
|
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
|
||||||
return true;
|
return true;
|
||||||
|
@ -209,11 +214,36 @@ public abstract class AbstractPaceFunctions {
|
||||||
return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0;
|
return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//returns true if at least 1 city is in common
|
||||||
|
//returns true if a name has no cities
|
||||||
|
public boolean sameCity(String s1, String s2){
|
||||||
|
|
||||||
|
if (getCities(s1).isEmpty() || getCities(s2).isEmpty())
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
return CollectionUtils.intersection(getCities(s1), getCities(s2)).size()>0;
|
||||||
|
}
|
||||||
|
|
||||||
|
//get the list of city codes in a string
|
||||||
|
public List<String> getCities(String s) {
|
||||||
|
|
||||||
|
final String regex = "\\bcity::[0-9]*\\b";
|
||||||
|
|
||||||
|
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
||||||
|
Matcher m = p.matcher(s);
|
||||||
|
List<String> codes = new ArrayList<>();
|
||||||
|
while (m.find()) {
|
||||||
|
codes.add(m.group(0));
|
||||||
|
for (int i = 1; i <= m.groupCount(); i++) {
|
||||||
|
codes.add(m.group(0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return codes;
|
||||||
|
}
|
||||||
|
|
||||||
//get the list of keywords in a string
|
//get the list of keywords in a string
|
||||||
public List<String> getKeywords(String s) {
|
public List<String> getKeywords(String s) {
|
||||||
|
|
||||||
// final String regex = " \\d+ ";
|
|
||||||
|
|
||||||
final String regex = "\\bkey::[0-9]*\\b";
|
final String regex = "\\bkey::[0-9]*\\b";
|
||||||
|
|
||||||
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
||||||
|
|
|
@ -5,7 +5,6 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -22,6 +21,8 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
//key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier
|
//key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier
|
||||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||||
|
|
||||||
|
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
public JaroWinklerNormalizedName(Map<String, Number> params){
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
@ -46,10 +47,22 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
ca = translate(ca, translationMap);
|
ca = translate(ca, translationMap);
|
||||||
cb = translate(cb, translationMap);
|
cb = translate(cb, translationMap);
|
||||||
|
|
||||||
if (sameKeywords(ca,cb)) {
|
String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
|
||||||
return normalize(ssalgo.score(removeCodes(ca), removeCodes(cb)));
|
ca = norm.split("\\|\\|\\|")[0].trim();
|
||||||
|
cb = norm.split("\\|\\|\\|")[1].trim();
|
||||||
|
|
||||||
|
if (sameCity(ca,cb)){
|
||||||
|
if (sameKeywords(ca,cb)){
|
||||||
|
ca = removeCodes(ca);
|
||||||
|
cb = removeCodes(cb);
|
||||||
|
if (ca.isEmpty() && cb.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
else
|
||||||
|
return normalize(ssalgo.score(ca,cb));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -41,7 +41,7 @@ public class BlockProcessor {
|
||||||
final Queue<MapDocument> q = prepare(documents);
|
final Queue<MapDocument> q = prepare(documents);
|
||||||
|
|
||||||
if (q.size() > 1) {
|
if (q.size() > 1) {
|
||||||
log.info("reducing key: '" + key + "' records: " + q.size());
|
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||||
//process(q, context);
|
//process(q, context);
|
||||||
process(simplifyQueue(q, key, context), context);
|
process(simplifyQueue(q, key, context), context);
|
||||||
} else {
|
} else {
|
||||||
|
@ -109,7 +109,7 @@ public class BlockProcessor {
|
||||||
q.addAll(tempResults);
|
q.addAll(tempResults);
|
||||||
} else {
|
} else {
|
||||||
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
|
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
|
||||||
log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -150,7 +150,7 @@ public class BlockProcessor {
|
||||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||||
|
|
||||||
final ScoreResult sr = similarity(algo, pivot, curr);
|
final ScoreResult sr = similarity(algo, pivot, curr);
|
||||||
log.info(sr.toString()+"SCORE "+ sr.getScore());
|
// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||||
emitOutput(sr, idPivot, idCurr, context);
|
emitOutput(sr, idPivot, idCurr, context);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,16 +1,16 @@
|
||||||
package eu.dnetlib.pace.distance;
|
package eu.dnetlib.pace.distance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static junit.framework.Assert.assertEquals;
|
||||||
|
|
||||||
public class DistanceAlgoTest extends AbstractPaceFunctions {
|
public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||||
|
@ -47,9 +47,18 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName() {
|
public void testJaroWinklerNormalizedName() {
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("universita degli studi di genova", "universita di genova");
|
double result = jaroWinklerNormalizedName.distance("Universita di Pisa", "Universita di Parma");
|
||||||
|
|
||||||
System.out.println(result);
|
assertEquals(result, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName2() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa");
|
||||||
|
|
||||||
|
assertEquals(result, 1.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue