forked from D-Net/dnet-hadoop
minor changes
This commit is contained in:
parent
9ddd24ba36
commit
9fee2ed611
|
@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
|
|||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
|
||||
try {
|
||||
return asUrl(super.getValue(f)).getHost();
|
||||
} catch (MalformedURLException e) {
|
||||
|
|
|
@ -161,11 +161,14 @@ public class BlockProcessorForTesting {
|
|||
}
|
||||
else {
|
||||
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
|
||||
if(useTree)
|
||||
if (useTree)
|
||||
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
||||
else
|
||||
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
||||
}
|
||||
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
|
||||
// emitOutput(true, idPivot, idCurr, context);
|
||||
// }
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -180,44 +183,45 @@ public class BlockProcessorForTesting {
|
|||
return compare>=1.0;
|
||||
}
|
||||
|
||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||
//if the score gives 1, the publications are equivalent
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
params.put("mode", "count");
|
||||
|
||||
boolean startLayer = false;
|
||||
boolean hardcheck1Layer = false;
|
||||
double score = 0.0;
|
||||
|
||||
//START - comparison of the PIDs json lists
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
||||
if (result >= 0.5) //if the result of the comparison is greater than the threshold
|
||||
startLayer = true;
|
||||
|
||||
//HARDCHECK1 - comparison of title versions and authors size
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
SizeMatch sizeMatch = new SizeMatch(params);
|
||||
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||
if (Math.min(result1, result2) != 0)
|
||||
hardcheck1Layer = true;
|
||||
|
||||
//SOFTCHECK and HARDCHECK2 - comparison of the titles
|
||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
double titleScore = Double.isNaN(result3)?0.0:result3;
|
||||
|
||||
if (startLayer) {
|
||||
return titleScore >= 0.90;
|
||||
}
|
||||
else {
|
||||
if (hardcheck1Layer) {
|
||||
return titleScore >= 0.99;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
//levenstein title
|
||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
|
||||
score += 0.2;
|
||||
}
|
||||
|
||||
//pid
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
|
||||
score += 0.5;
|
||||
}
|
||||
|
||||
//title version
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
if(result1<0 || result1>=1.0) {
|
||||
score += 0.1;
|
||||
}
|
||||
|
||||
//authors match
|
||||
params.remove("mode");
|
||||
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||
if(result2 <0|| result2>=0.6) {
|
||||
score += 0.2;
|
||||
}
|
||||
|
||||
return score>=0.5;
|
||||
}
|
||||
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
if (result) {
|
||||
|
|
|
@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
|
||||
params.put("len", 3);
|
||||
params.put("max", 1);
|
||||
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
/**
 * Runs DomainExactMatch.compare on two URL fields that differ by a "www." host prefix
 * and a trailing slash, and prints the resulting score to standard output.
 * The params and conf values used here are defined outside this method.
 */
@Test
|
||||
public void domainExactMatch() {
|
||||
|
||||
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
|
||||
// first input: host written with the "www." prefix, no trailing slash
Field a = url("http://www.flowrepository.org");
|
||||
// second input: host written without "www.", with a trailing slash
Field b = url("http://flowrepository.org/");
|
||||
|
||||
double compare = domainExactMatch.compare(a, b, conf);
|
||||
// NOTE(review): the score is only printed and never asserted, so this test passes
// whatever compare() returns — confirm the intended value for these two URLs and assert it.
System.out.println("compare = " + compare);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue