test implementation for the new fdup version

This commit is contained in:
miconis 2022-04-13 09:48:56 +02:00
parent 10172553ab
commit 97a32faf9b
1 changed file with 18 additions and 12 deletions

View File

@@ -182,34 +182,40 @@ public class BlockProcessorForTesting {
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
double score = 0.0; boolean startLayer = false;
//LAYER 1 - comparison of the PIDs json lists boolean hardcheck1Layer = false;
//START - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>(); Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value"); params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid"); params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params); JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config); double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold if (result >= 0.5) //if the result of the comparison is greater than the threshold
score += 10.0; //high score because it should match when the first condition is satisfied startLayer = true;
else
score += 0.0;
//LAYER 2 - comparison of the title version and the size of the authors lists //HARDCHECK1 - comparison of title versions and authors size
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params); SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0) if (Math.min(result1, result2) != 0)
score+=0; hardcheck1Layer = true;
else
score-=2;
//LAYER 3 - computation of levenshtein on titles //SOFTCHECK and HARDCHECK2 - comparison of the titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params); LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
score += Double.isNaN(result3)?0.0:result3; double titleScore = Double.isNaN(result3)?0.0:result3;
return score >= 0.99; if (startLayer) {
return titleScore >= 0.90;
}
else {
if (hardcheck1Layer) {
return titleScore >= 0.99;
}
}
return false;
} }
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {