forked from D-Net/dnet-hadoop
test implementation for the new fdup version
This commit is contained in:
parent
10172553ab
commit
97a32faf9b
|
@@ -182,34 +182,40 @@ public class BlockProcessorForTesting {
|
||||||
|
|
||||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||||
|
|
||||||
double score = 0.0;
|
boolean startLayer = false;
|
||||||
//LAYER 1 - comparison of the PIDs json lists
|
boolean hardcheck1Layer = false;
|
||||||
|
|
||||||
|
//START - comparison of the PIDs json lists
|
||||||
Map<String, String> params = new HashMap<>();
|
Map<String, String> params = new HashMap<>();
|
||||||
params.put("jpath_value", "$.value");
|
params.put("jpath_value", "$.value");
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||||
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
||||||
if (result >= 0.5) //if the result of the comparison is greater than the threshold
|
if (result >= 0.5) //if the result of the comparison is greater than the threshold
|
||||||
score += 10.0; //high score because it should match when the first condition is satisfied
|
startLayer = true;
|
||||||
else
|
|
||||||
score += 0.0;
|
|
||||||
|
|
||||||
//LAYER 2 - comparison of the title version and the size of the authors lists
|
//HARDCHECK1 - comparison of title versions and authors size
|
||||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||||
SizeMatch sizeMatch = new SizeMatch(params);
|
SizeMatch sizeMatch = new SizeMatch(params);
|
||||||
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||||
if (Math.min(result1, result2) != 0)
|
if (Math.min(result1, result2) != 0)
|
||||||
score+=0;
|
hardcheck1Layer = true;
|
||||||
else
|
|
||||||
score-=2;
|
|
||||||
|
|
||||||
//LAYER 3 - computation of levenshtein on titles
|
//SOFTCHECK and HARDCHECK2 - comparison of the titles
|
||||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||||
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||||
score += Double.isNaN(result3)?0.0:result3;
|
double titleScore = Double.isNaN(result3)?0.0:result3;
|
||||||
|
|
||||||
return score >= 0.99;
|
if (startLayer) {
|
||||||
|
return titleScore >= 0.90;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (hardcheck1Layer) {
|
||||||
|
return titleScore >= 0.99;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||||
|
|
Loading…
Reference in New Issue