addition of cities check

This commit is contained in:
Michele De Bonis 2018-11-16 16:11:03 +01:00
parent 3a517a6551
commit 23c5a16525
10 changed files with 10684 additions and 23 deletions

View File

@ -4,6 +4,7 @@ import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
@ -34,7 +35,7 @@ public class SparkTest {
public static void main(String[] args) {
final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]"));
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/orgs.json");
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/orgs2.json");
final JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
counter = new SparkCounter(context);
@ -85,12 +86,12 @@ public class SparkTest {
counter.getAccumulators().values().forEach(it-> System.out.println(it.getGroup()+" "+it.getName()+" -->"+it.value()));
connectedComponents.foreach(cc -> System.out.println("cc = " + cc.toString() + " size =" + cc.getDocs().size()));
nonDeduplicated.foreach(cc -> System.out.println("nd = " + cc.toString()));
// connectedComponents.foreach(cc -> System.out.println("cc = " + cc.toString() + " size =" + cc.getDocs().size()));
// nonDeduplicated.foreach(cc -> System.out.println("nd = " + cc.toString()));
//print ids
// ccs.foreach(cc -> System.out.println(cc.getId()));
// ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -69,6 +69,11 @@
<artifactId>jackson-mapper-asl</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
</dependencies>

View File

@ -12,6 +12,7 @@ import eu.dnetlib.pace.model.FieldListImpl;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import java.text.Normalizer;
import java.util.*;
@ -171,7 +172,7 @@ public abstract class AbstractPaceFunctions {
String[] line = s.split(";");
String value = line[0];
for (String key: line){
m.put(fixAliases(key),value);
m.put(fixAliases(key).toLowerCase(),value);
}
}
} catch (final Throwable e){
@ -191,17 +192,21 @@ public abstract class AbstractPaceFunctions {
return sb.toString().trim();
}
//TODO remove also codes of the cities
/**
 * Replaces every city name found in {@code s1} with the value mapped to it in {@code cityMap}.
 * A name is replaced only where it occurs with a single space on each side, so a name at the
 * very start or end of the text is matched only if the input is padded with spaces.
 *
 * @param s1 the text to normalize
 * @param cityMap city name -> replacement value
 * @return the text with each space-delimited city name substituted
 */
public String normalizeCities(String s1, Map<String,String> cityMap){
    for (Map.Entry<String,String> entry : cityMap.entrySet()) {
        // Literal replacement: city names may contain regex metacharacters ('.', '(' ...)
        // and must not be interpreted as patterns.
        s1 = s1.replace(" " + entry.getKey() + " ", " " + entry.getValue() + " ");
    }
    return s1;
}
public String removeCodes(String s) {
final String regex = "\\bkey::[0-9]*\\b";
return s.replaceAll(regex, "").trim();
final String regexKey = "\\bkey::[0-9]*\\b";
final String regexCity = "\\bcity::[0-9]*\\b";
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
}
//check if 2 strings have same keywords
public boolean sameKeywords(String s1, String s2){
//all keywords in common
//return getKeywords(s1).containsAll(getKeywords(s2)) && getKeywords(s2).containsAll(getKeywords(s1));
//at least 1 keyword in common
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
return true;
@ -209,11 +214,36 @@ public abstract class AbstractPaceFunctions {
return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0;
}
/**
 * Checks whether two strings are compatible with respect to the city codes they contain.
 *
 * @param s1 first string
 * @param s2 second string
 * @return {@code true} if either string contains no city code, or if the two strings
 *         share at least one city code; {@code false} otherwise
 */
public boolean sameCity(String s1, String s2){
    // Extract the codes once per argument instead of re-scanning the strings for every check.
    final List<String> cities1 = getCities(s1);
    if (cities1.isEmpty()) {
        return true;
    }
    final List<String> cities2 = getCities(s2);
    if (cities2.isEmpty()) {
        return true;
    }
    return !Collections.disjoint(cities1, cities2);
}
// Matches a city code: the literal "city::" followed by digits, delimited by word boundaries.
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("\\bcity::[0-9]*\\b");

/**
 * Gets the list of city codes contained in a string.
 *
 * @param s the string to scan
 * @return every city code found, in order of appearance (duplicates included);
 *         an empty list when there is none
 */
public List<String> getCities(String s) {
    final Matcher m = CITY_CODE_PATTERN.matcher(s);
    final List<String> codes = new ArrayList<>();
    // The pattern has no capture groups, so the whole match is the code.
    while (m.find()) {
        codes.add(m.group());
    }
    return codes;
}
//get the list of keywords in a string
public List<String> getKeywords(String s) {
// final String regex = " \\d+ ";
final String regex = "\\bkey::[0-9]*\\b";
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);

View File

@ -5,7 +5,6 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@ -22,6 +21,8 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
//key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
public JaroWinklerNormalizedName(Map<String, Number> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
@ -46,10 +47,22 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
ca = translate(ca, translationMap);
cb = translate(cb, translationMap);
if (sameKeywords(ca,cb)) {
return normalize(ssalgo.score(removeCodes(ca), removeCodes(cb)));
String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
ca = norm.split("\\|\\|\\|")[0].trim();
cb = norm.split("\\|\\|\\|")[1].trim();
if (sameCity(ca,cb)){
if (sameKeywords(ca,cb)){
ca = removeCodes(ca);
cb = removeCodes(cb);
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca,cb));
}
}
return 0.0;
}
@Override

View File

@ -41,7 +41,7 @@ public class BlockProcessor {
final Queue<MapDocument> q = prepare(documents);
if (q.size() > 1) {
log.info("reducing key: '" + key + "' records: " + q.size());
// log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(simplifyQueue(q, key, context), context);
} else {
@ -109,7 +109,7 @@ public class BlockProcessor {
q.addAll(tempResults);
} else {
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
}
}
@ -150,7 +150,7 @@ public class BlockProcessor {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final ScoreResult sr = similarity(algo, pivot, curr);
log.info(sr.toString()+"SCORE "+ sr.getScore());
// log.info(sr.toString()+"SCORE "+ sr.getScore());
emitOutput(sr, idPivot, idCurr, context);
i++;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,16 +1,16 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Test;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static junit.framework.Assert.assertEquals;
public class DistanceAlgoTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
@ -47,9 +47,18 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
@Test
public void testJaroWinklerNormalizedName() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("universita degli studi di genova", "universita di genova");
double result = jaroWinklerNormalizedName.distance("Universita di Pisa", "Universita di Parma");
System.out.println(result);
assertEquals(result, 0.0);
}
/**
 * Two spellings of the same organization ("University of Pisa") are expected to get
 * the maximum score of 1.0.
 */
@Test
public void testJaroWinklerNormalizedName2() {
    final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
    double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa");
    // expected value first, then actual; zero delta keeps the comparison exact
    assertEquals(1.0, result, 0.0);
}
}

View File

@ -136,6 +136,11 @@
<version>1.9.13</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>