minor changes

This commit is contained in:
Michele De Bonis 2022-11-21 14:35:46 +01:00
parent 9ddd24ba36
commit 9fee2ed611
4 changed files with 57 additions and 35 deletions

View File

@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override @Override
protected String getValue(final Field f) { protected String getValue(final Field f) {
try { try {
return asUrl(super.getValue(f)).getHost(); return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) { } catch (MalformedURLException e) {

View File

@ -161,11 +161,14 @@ public class BlockProcessorForTesting {
} }
else { else {
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications) //use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if(useTree) if (useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
} }
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
} }
} }
@ -180,44 +183,45 @@ public class BlockProcessorForTesting {
return compare>=1.0; return compare>=1.0;
} }
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("mode", "count");
boolean startLayer = false; double score = 0.0;
boolean hardcheck1Layer = false;
//START - comparison of the PIDs json lists //levenstein title
Map<String, String> params = new HashMap<>(); LevensteinTitle levensteinTitle = new LevensteinTitle(params);
params.put("jpath_value", "$.value"); if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
params.put("jpath_classid", "$.qualifier.classid"); score += 0.2;
JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
startLayer = true;
//HARDCHECK1 - comparison of title versions and authors size
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0)
hardcheck1Layer = true;
//SOFTCHECK and HARDCHECK2 - comparison of the titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
double titleScore = Double.isNaN(result3)?0.0:result3;
if (startLayer) {
return titleScore >= 0.90;
}
else {
if (hardcheck1Layer) {
return titleScore >= 0.99;
}
}
return false;
} }
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
if(result1<0 || result1>=1.0) {
score += 0.1;
}
//authors match
params.remove("mode");
AuthorsMatch authorsMatch = new AuthorsMatch(params);
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
return score>=0.5;
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) { if (result) {

View File

@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
} }
@Test @Test

View File

@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result); assertEquals(1.0, result);
} }
/**
 * Runs {@link DomainExactMatch} on two URL fields that differ by a leading
 * "www." and a trailing slash, and prints the resulting score.
 *
 * NOTE(review): this test only prints the score and asserts nothing, so it
 * cannot fail on a wrong result. Confirm the expected value for these two
 * inputs and add an assertion.
 */
@Test
public void domainExactMatch() {
    final DomainExactMatch comparator = new DomainExactMatch(params);

    // Same site written two ways: with "www." and without (plus trailing slash).
    final Field withWww = url("http://www.flowrepository.org");
    final Field withoutWww = url("http://flowrepository.org/");

    final double score = comparator.compare(withWww, withoutWww, conf);

    System.out.println("compare = " + score);
}
} }