forked from D-Net/dnet-hadoop
minor changes
This commit is contained in:
parent
9ddd24ba36
commit
9fee2ed611
|
@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getValue(final Field f) {
|
protected String getValue(final Field f) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return asUrl(super.getValue(f)).getHost();
|
return asUrl(super.getValue(f)).getHost();
|
||||||
} catch (MalformedURLException e) {
|
} catch (MalformedURLException e) {
|
||||||
|
|
|
@ -161,11 +161,14 @@ public class BlockProcessorForTesting {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
|
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
|
||||||
if(useTree)
|
if (useTree)
|
||||||
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
||||||
else
|
else
|
||||||
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
||||||
}
|
}
|
||||||
|
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
|
||||||
|
// emitOutput(true, idPivot, idCurr, context);
|
||||||
|
// }
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -180,44 +183,45 @@ public class BlockProcessorForTesting {
|
||||||
return compare>=1.0;
|
return compare>=1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||||
|
//if the score gives 1, the publications are equivalent
|
||||||
|
Map<String, String> params = new HashMap<>();
|
||||||
|
params.put("jpath_value", "$.value");
|
||||||
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
|
params.put("mode", "count");
|
||||||
|
|
||||||
boolean startLayer = false;
|
double score = 0.0;
|
||||||
boolean hardcheck1Layer = false;
|
|
||||||
|
|
||||||
//START - comparison of the PIDs json lists
|
//levenstein title
|
||||||
Map<String, String> params = new HashMap<>();
|
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||||
params.put("jpath_value", "$.value");
|
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
score += 0.2;
|
||||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
|
||||||
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
|
||||||
if (result >= 0.5) //if the result of the comparison is greater than the threshold
|
|
||||||
startLayer = true;
|
|
||||||
|
|
||||||
//HARDCHECK1 - comparison of title versions and authors size
|
|
||||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
|
||||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
|
||||||
SizeMatch sizeMatch = new SizeMatch(params);
|
|
||||||
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
|
||||||
if (Math.min(result1, result2) != 0)
|
|
||||||
hardcheck1Layer = true;
|
|
||||||
|
|
||||||
//SOFTCHECK and HARDCHECK2 - comparison of the titles
|
|
||||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
|
||||||
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
|
||||||
double titleScore = Double.isNaN(result3)?0.0:result3;
|
|
||||||
|
|
||||||
if (startLayer) {
|
|
||||||
return titleScore >= 0.90;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (hardcheck1Layer) {
|
|
||||||
return titleScore >= 0.99;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//pid
|
||||||
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||||
|
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
|
||||||
|
score += 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
//title version
|
||||||
|
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||||
|
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||||
|
if(result1<0 || result1>=1.0) {
|
||||||
|
score += 0.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//authors match
|
||||||
|
params.remove("mode");
|
||||||
|
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||||
|
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||||
|
if(result2 <0|| result2>=0.6) {
|
||||||
|
score += 0.2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return score>=0.5;
|
||||||
|
}
|
||||||
|
|
||||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||||
|
|
||||||
if (result) {
|
if (result) {
|
||||||
|
|
|
@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||||
|
|
||||||
|
params.put("len", 3);
|
||||||
|
params.put("max", 1);
|
||||||
|
|
||||||
|
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void domainExactMatch() {
|
||||||
|
|
||||||
|
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
|
||||||
|
Field a = url("http://www.flowrepository.org");
|
||||||
|
Field b = url("http://flowrepository.org/");
|
||||||
|
|
||||||
|
double compare = domainExactMatch.compare(a, b, conf);
|
||||||
|
System.out.println("compare = " + compare);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue