Added DiffPatchMatch utility. Re-enabled the previously commented-out tests.
This commit is contained in:
parent
7c59c3ebf0
commit
9f513352fb
|
@ -22,22 +22,24 @@ import scala.Tuple2;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.net.URL;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
public class SparkTest {
|
||||
public static SparkCounter counter ;
|
||||
private static final Log log = LogFactory.getLog(SparkTest.class);
|
||||
|
||||
public static void main(String[] args) {
|
||||
final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]"));
|
||||
final JavaRDD<String> dataRDD = context.textFile("file:///Users/miconis/Downloads/dumps/organizations_sample.json");
|
||||
|
||||
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/orgs.test.json");
|
||||
final JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
|
||||
|
||||
counter = new SparkCounter(context);
|
||||
|
||||
//read the configuration from the classpath
|
||||
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.test.pace.conf"));
|
||||
|
||||
BlockProcessor.constructAccumulator(config);
|
||||
BlockProcessor.accumulators.forEach(acc -> {
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.98",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.4", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : {"host":"1.0", "path":"0.0"} }
|
||||
],
|
||||
"blacklists" : { }
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.98",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }
|
||||
],
|
||||
"blacklists" : { }
|
||||
}
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
{"dateoftransformation":"2018-09-17","originalId":["corda__h2020::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI VERONA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::3898c35de19616484a0e901a92a709f5"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["corda_______::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI VERONA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::3898c35de19616484a0e901a92a709f5"}
|
||||
{"dateoftransformation":"2018-09-17","originalId":["corda__h2020::999976687"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIGE"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unige.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI GENOVA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::67fba37704a39567853e54615c5371fe"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Genova"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"https://www.unige.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Genova"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-09-01","type":20,"id":"20|opendoar____::fcd6c93c2863e6be9c6f6a66d761c92d"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Verona"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Verona"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::3f2f26e4bf71340e806ec956884fe34e"}
|
|
@ -0,0 +1,2 @@
|
|||
{"dateoftransformation":"2018-09-17","originalId":["corda__h2020::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"VERONA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::3898c35de19616484a0e901a92a709f5"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["corda_______::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"GENOVA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::3898c35de19616484a0e901a92a709f5"}
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"run" : "001",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
|
|
|
@ -1,208 +1,212 @@
|
|||
//package eu.dnetlib.pace;
|
||||
//
|
||||
//import com.google.common.collect.Lists;
|
||||
//import com.google.common.collect.Sets;
|
||||
//import com.google.gson.Gson;
|
||||
//import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
||||
//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||
//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
||||
//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
|
||||
//import eu.dnetlib.data.proto.OafProtos.Oaf;
|
||||
//import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
||||
//import eu.dnetlib.data.proto.OrganizationProtos.Organization;
|
||||
//import eu.dnetlib.data.proto.ResultProtos.Result;
|
||||
//import eu.dnetlib.pace.config.Config;
|
||||
//import eu.dnetlib.pace.config.DedupConfig;
|
||||
//import eu.dnetlib.pace.config.Type;
|
||||
//import eu.dnetlib.pace.model.Field;
|
||||
//import eu.dnetlib.pace.model.FieldValueImpl;
|
||||
//import eu.dnetlib.pace.model.MapDocument;
|
||||
//import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||
//import eu.dnetlib.pace.model.gt.GTAuthor;
|
||||
//import org.apache.commons.io.IOUtils;
|
||||
//import org.apache.commons.lang.RandomStringUtils;
|
||||
//import org.apache.commons.lang.StringUtils;
|
||||
//import org.apache.commons.lang3.RandomUtils;
|
||||
//
|
||||
//import java.io.IOException;
|
||||
//import java.io.StringWriter;
|
||||
//import java.util.ArrayList;
|
||||
//import java.util.LinkedList;
|
||||
//import java.util.List;
|
||||
//import java.util.Set;
|
||||
//import java.util.stream.Collectors;
|
||||
//import java.util.stream.IntStream;
|
||||
//
|
||||
//public abstract class AbstractProtoPaceTest extends OafTest {
|
||||
//
|
||||
// protected DedupConfig getResultFullConf() {
|
||||
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
|
||||
// }
|
||||
//
|
||||
// protected DedupConfig getResultSimpleConf() {
|
||||
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
|
||||
// }
|
||||
//
|
||||
// protected DedupConfig getResultConf() {
|
||||
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
|
||||
// }
|
||||
//
|
||||
// protected DedupConfig getOrganizationSimpleConf() {
|
||||
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||
// }
|
||||
//
|
||||
// protected DedupConfig getResultAuthorsConf() {
|
||||
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
|
||||
// }
|
||||
//
|
||||
// protected DedupConfig getResultProdConf() {
|
||||
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
|
||||
// }
|
||||
//
|
||||
// protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
|
||||
// return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
|
||||
// }
|
||||
//
|
||||
// protected GTAuthor getGTAuthor(final String path) {
|
||||
//
|
||||
// final Gson gson = new Gson();
|
||||
//
|
||||
// final String json = readFromClasspath(path);
|
||||
//
|
||||
// final GTAuthor gta = gson.fromJson(json, GTAuthor.class);
|
||||
//
|
||||
// return gta;
|
||||
// }
|
||||
//
|
||||
// protected String readFromClasspath(final String filename) {
|
||||
// final StringWriter sw = new StringWriter();
|
||||
// try {
|
||||
// IOUtils.copy(getClass().getResourceAsStream(filename), sw);
|
||||
// return sw.toString();
|
||||
// } catch (final IOException e) {
|
||||
// throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// protected MapDocument result(final Config config, final String id, final String title) {
|
||||
// return result(config, id, title, null, new ArrayList<>(), null);
|
||||
// }
|
||||
//
|
||||
// protected MapDocument result(final Config config, final String id, final String title, final String date) {
|
||||
// return result(config, id, title, date, new ArrayList<>(), null);
|
||||
// }
|
||||
//
|
||||
// protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
|
||||
// return result(config, id, title, date, pid, null);
|
||||
// }
|
||||
//
|
||||
// protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
|
||||
// return result(config, id, title, date, pid, null);
|
||||
// }
|
||||
//
|
||||
// protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
|
||||
// return result(config, id, title, date, Lists.newArrayList(pid), authors);
|
||||
// }
|
||||
//
|
||||
// static List<String> pidTypes = Lists.newArrayList();
|
||||
// static {
|
||||
// pidTypes.add("doi");
|
||||
// //pidTypes.add("oai");
|
||||
// //pidTypes.add("pmid");
|
||||
// }
|
||||
//
|
||||
// protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
|
||||
// final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
|
||||
// if (!StringUtils.isBlank(title)) {
|
||||
// metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
|
||||
// metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
|
||||
// }
|
||||
// if (!StringUtils.isBlank(date)) {
|
||||
// metadata.setDateofacceptance(sf(date));
|
||||
// }
|
||||
//
|
||||
// final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
||||
// final Result.Builder result = Result.newBuilder().setMetadata(metadata);
|
||||
//
|
||||
// if (authors != null) {
|
||||
// result.getMetadataBuilder().addAllAuthor(
|
||||
// IntStream.range(0, authors.size())
|
||||
// .mapToObj(i -> author(authors.get(i), i))
|
||||
// .collect(Collectors.toCollection(LinkedList::new)));
|
||||
// }
|
||||
//
|
||||
// entity.setResult(result);
|
||||
//
|
||||
// if (pid != null) {
|
||||
// for(String p : pid) {
|
||||
// if (!StringUtils.isBlank(p)) {
|
||||
// entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1))));
|
||||
// //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// final OafEntity build = entity.build();
|
||||
// return ProtoDocumentBuilder.newInstance(id, build, config.model());
|
||||
// }
|
||||
//
|
||||
// private Author author(final String s, int rank) {
|
||||
// final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
|
||||
// final Author.Builder author = Author.newBuilder();
|
||||
// if (p.isAccurate()) {
|
||||
// author.setName(p.getNormalisedFirstName());
|
||||
// author.setSurname(p.getNormalisedSurname());
|
||||
// }
|
||||
// author.setFullname(p.getNormalisedFullname());
|
||||
// author.setRank(rank);
|
||||
//
|
||||
// return author.build();
|
||||
// }
|
||||
//
|
||||
// private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
|
||||
// final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type);
|
||||
// return entity;
|
||||
// }
|
||||
//
|
||||
// protected MapDocument organization(final Config config, final String id, final String legalName) {
|
||||
// return organization(config, id, legalName, null);
|
||||
// }
|
||||
//
|
||||
// protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
|
||||
// final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
|
||||
// if (legalName != null) {
|
||||
// metadata.setLegalname(sf(legalName));
|
||||
// }
|
||||
// if (legalShortName != null) {
|
||||
// metadata.setLegalshortname(sf(legalShortName));
|
||||
// }
|
||||
//
|
||||
// final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
||||
// entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
|
||||
//
|
||||
// return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
|
||||
// }
|
||||
//
|
||||
// private StructuredProperty sp(final String pid, final String type) {
|
||||
// final Builder pidSp =
|
||||
// StructuredProperty.newBuilder().setValue(pid)
|
||||
// .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types"));
|
||||
// return pidSp.build();
|
||||
// }
|
||||
//
|
||||
// protected Field title(final String s) {
|
||||
// return new FieldValueImpl(Type.String, "title", s);
|
||||
// }
|
||||
//
|
||||
// protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
|
||||
// return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
|
||||
// }
|
||||
//
|
||||
// /*
|
||||
// * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); }
|
||||
// *
|
||||
// * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return
|
||||
// * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); }
|
||||
// */
|
||||
//
|
||||
//}
|
||||
package eu.dnetlib.pace;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
|
||||
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
||||
import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
||||
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
|
||||
import eu.dnetlib.data.proto.ResultProtos.Result;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||
import eu.dnetlib.pace.model.gt.GTAuthor;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.RandomStringUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.RandomUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public abstract class AbstractProtoPaceTest extends OafTest {
|
||||
|
||||
protected DedupConfig getResultFullConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
|
||||
}
|
||||
|
||||
protected DedupConfig getResultSimpleConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
|
||||
}
|
||||
|
||||
protected DedupConfig getResultConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
|
||||
}
|
||||
|
||||
protected DedupConfig getOrganizationSimpleConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||
}
|
||||
|
||||
|
||||
protected DedupConfig getOrganizationTestConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.test.conf"));
|
||||
}
|
||||
|
||||
|
||||
protected DedupConfig getResultAuthorsConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
|
||||
}
|
||||
|
||||
protected DedupConfig getResultProdConf() {
|
||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
|
||||
}
|
||||
|
||||
protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
|
||||
return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
|
||||
}
|
||||
|
||||
protected GTAuthor getGTAuthor(final String path) {
|
||||
|
||||
final Gson gson = new Gson();
|
||||
|
||||
final String json = readFromClasspath(path);
|
||||
|
||||
final GTAuthor gta = gson.fromJson(json, GTAuthor.class);
|
||||
|
||||
return gta;
|
||||
}
|
||||
|
||||
protected String readFromClasspath(final String filename) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title) {
|
||||
return result(config, id, title, null, new ArrayList<>(), null);
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title, final String date) {
|
||||
return result(config, id, title, date, new ArrayList<>(), null);
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
|
||||
return result(config, id, title, date, pid, null);
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
|
||||
return result(config, id, title, date, pid, null);
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
|
||||
return result(config, id, title, date, Lists.newArrayList(pid), authors);
|
||||
}
|
||||
|
||||
static List<String> pidTypes = Lists.newArrayList();
|
||||
static {
|
||||
pidTypes.add("doi");
|
||||
//pidTypes.add("oai");
|
||||
//pidTypes.add("pmid");
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
|
||||
final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
|
||||
if (!StringUtils.isBlank(title)) {
|
||||
metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
|
||||
metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
|
||||
}
|
||||
if (!StringUtils.isBlank(date)) {
|
||||
metadata.setDateofacceptance(sf(date));
|
||||
}
|
||||
|
||||
final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
||||
final Result.Builder result = Result.newBuilder().setMetadata(metadata);
|
||||
|
||||
if (authors != null) {
|
||||
result.getMetadataBuilder().addAllAuthor(
|
||||
IntStream.range(0, authors.size())
|
||||
.mapToObj(i -> author(authors.get(i), i))
|
||||
.collect(Collectors.toCollection(LinkedList::new)));
|
||||
}
|
||||
|
||||
entity.setResult(result);
|
||||
|
||||
if (pid != null) {
|
||||
for(String p : pid) {
|
||||
if (!StringUtils.isBlank(p)) {
|
||||
entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1))));
|
||||
//entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final OafEntity build = entity.build();
|
||||
return ProtoDocumentBuilder.newInstance(id, build, config.model());
|
||||
}
|
||||
|
||||
private Author author(final String s, int rank) {
|
||||
final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
|
||||
final Author.Builder author = Author.newBuilder();
|
||||
if (p.isAccurate()) {
|
||||
author.setName(p.getNormalisedFirstName());
|
||||
author.setSurname(p.getNormalisedSurname());
|
||||
}
|
||||
author.setFullname(p.getNormalisedFullname());
|
||||
author.setRank(rank);
|
||||
|
||||
return author.build();
|
||||
}
|
||||
|
||||
private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
|
||||
final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type);
|
||||
return entity;
|
||||
}
|
||||
|
||||
protected MapDocument organization(final Config config, final String id, final String legalName) {
|
||||
return organization(config, id, legalName, null);
|
||||
}
|
||||
|
||||
protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
|
||||
final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
|
||||
if (legalName != null) {
|
||||
metadata.setLegalname(sf(legalName));
|
||||
}
|
||||
if (legalShortName != null) {
|
||||
metadata.setLegalshortname(sf(legalShortName));
|
||||
}
|
||||
|
||||
final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
||||
entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
|
||||
|
||||
return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
|
||||
}
|
||||
|
||||
private StructuredProperty sp(final String pid, final String type) {
|
||||
final Builder pidSp =
|
||||
StructuredProperty.newBuilder().setValue(pid)
|
||||
.setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types"));
|
||||
return pidSp.build();
|
||||
}
|
||||
|
||||
protected Field title(final String s) {
|
||||
return new FieldValueImpl(Type.String, "title", s);
|
||||
}
|
||||
|
||||
protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
|
||||
return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
|
||||
}
|
||||
|
||||
/*
|
||||
* protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); }
|
||||
*
|
||||
* protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return
|
||||
* Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); }
|
||||
*/
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,450 +1,360 @@
|
|||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import com.google.common.collect.Lists;
|
||||
//import com.google.common.collect.Maps;
|
||||
//import com.google.common.collect.Sets;
|
||||
//import com.googlecode.protobuf.format.JsonFormat;
|
||||
//import eu.dnetlib.data.proto.OafProtos;
|
||||
//import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||
//import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||
//import eu.dnetlib.pace.config.Config;
|
||||
//import eu.dnetlib.pace.config.DedupConfig;
|
||||
//import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
//import eu.dnetlib.pace.model.MapDocument;
|
||||
//import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||
//import org.apache.commons.io.IOUtils;
|
||||
//import org.apache.commons.logging.Log;
|
||||
//import org.apache.commons.logging.LogFactory;
|
||||
//import org.junit.Ignore;
|
||||
//import org.junit.Test;
|
||||
//
|
||||
//import java.io.IOException;
|
||||
//import java.util.List;
|
||||
//import java.util.Map;
|
||||
//import java.util.Set;
|
||||
//import java.util.stream.Collectors;
|
||||
//
|
||||
//import static org.junit.Assert.assertFalse;
|
||||
//import static org.junit.Assert.assertTrue;
|
||||
//
|
||||
//public class DetectorTest extends AbstractProtoPaceTest {
|
||||
//
|
||||
// private static final Log log = LogFactory.getLog(DetectorTest.class);
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultSimple() {
|
||||
// final Config config = getResultSimpleConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Recent results from CDF");
|
||||
// final MapDocument resB = result(config, "B", "Recent results from CDF");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue(d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultSimpleMissingDates() {
|
||||
// final Config config = getResultSimpleConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Recent results from BES");
|
||||
// final MapDocument resB = result(config, "A", "Recent results from CES");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue(d > 0.97);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultInvalidDate() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
|
||||
// final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue(d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Ignore
|
||||
// @Test
|
||||
// public void testDistanceResultMissingOneDate() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "title title title 6BESR", null);
|
||||
// final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue((d > 0.9) && (d < 1.0));
|
||||
// }
|
||||
//
|
||||
// @Ignore
|
||||
// @Test
|
||||
// public void testDistanceResult() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "title title title BES", "");
|
||||
// final MapDocument resB = result(config, "B", "title title title CLEO");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue((d > 0.9) && (d < 1.0));
|
||||
// }
|
||||
//
|
||||
// @Ignore
|
||||
// @Test
|
||||
// public void testDistanceResultMissingTwoDate() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "bellaciao");
|
||||
// final MapDocument resB = result(config, "B", "bellocioa");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue((d > 0.9) && (d < 1.0));
|
||||
// }
|
||||
//
|
||||
// @Ignore
|
||||
// @Test
|
||||
// public void testDistanceOrganizationIgnoreMissing() {
|
||||
//
|
||||
// final Config config = getOrganizationSimpleConf();
|
||||
//
|
||||
// final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
|
||||
// final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue(d > 0.99);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCase1() {
|
||||
//
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
|
||||
// final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue((d > 0.9) && (d < 1.0));
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch1() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("exact DOIs will produce an exact match", d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch2() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch3() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch4() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch5() {
|
||||
//
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch6() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseDoiMatch7() {
|
||||
// final Config config = getResultConf();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
|
||||
// final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.9 & d < 1);
|
||||
// }
|
||||
//
|
||||
// // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseAuthor1() {
|
||||
//
|
||||
// final Config config = getResultAuthorsConf();
|
||||
//
|
||||
// final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
|
||||
// final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
||||
// final List<String> pid = Lists.newArrayList();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue(d == 0.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseAuthor2() {
|
||||
//
|
||||
// final Config config = getResultAuthorsConf();
|
||||
//
|
||||
// final List<String> authorsA = Lists.newArrayList("a", "b", "c");
|
||||
// final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
||||
// final List<String> pid = Lists.newArrayList();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue(d == 1.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseAuthor3() {
|
||||
//
|
||||
// final Config config = getResultAuthorsConf();
|
||||
//
|
||||
// final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
|
||||
// final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
||||
// final List<String> pid = Lists.newArrayList();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// assertTrue((d > 0.9) && (d < 1.0));
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultCaseAuthor4() {
|
||||
//
|
||||
// final Config config = getResultAuthorsConf();
|
||||
//
|
||||
// final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
|
||||
// final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
||||
// final List<String> pid = Lists.newArrayList();
|
||||
//
|
||||
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// // assertTrue(d.getScore() == 0.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultNoPidsConf() {
|
||||
//
|
||||
// final Config config = getResultFullConf();
|
||||
//
|
||||
// final MapDocument resA =
|
||||
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
|
||||
//
|
||||
// final MapDocument resB =
|
||||
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double s = sr.getScore();
|
||||
//
|
||||
// log.info(sr.toString());
|
||||
// log.info(String.format(" s ---> %s", s));
|
||||
// // assertTrue(d.getScore() == 0.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultPidsConf() {
|
||||
//
|
||||
// final Config config = getResultFullConf();
|
||||
//
|
||||
// final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||
// final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||
//
|
||||
// final List<String> pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
|
||||
// final MapDocument resA =
|
||||
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
// pidA, authorsA);
|
||||
//
|
||||
// final List<String> pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
|
||||
// final MapDocument resB =
|
||||
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
|
||||
// pidB, authorsB);
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double s = sr.getScore();
|
||||
// log.info(sr.toString());
|
||||
// log.info(String.format(" s ---> %s", s));
|
||||
//
|
||||
// // assertTrue(d.getScore() == 0.0);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testDistanceResultFullConf() {
|
||||
//
|
||||
// final Config config = getResultFullConf();
|
||||
//
|
||||
// final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||
// final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||
//
|
||||
// final MapDocument resA =
|
||||
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
// "10.1186/1752-1947-4-299", authorsA);
|
||||
//
|
||||
// final MapDocument resB =
|
||||
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
// "10.1186/1752-1947-4-299", authorsB);
|
||||
//
|
||||
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
// final double d = sr.getScore();
|
||||
// log.info(String.format(" d ---> %s", d));
|
||||
//
|
||||
// // assertTrue(d.getScore() == 0.0);
|
||||
// }
|
||||
//
|
||||
// @Ignore
|
||||
// @Test
|
||||
// public void testDistance() throws IOException {
|
||||
//
|
||||
// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
|
||||
//
|
||||
// final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json");
|
||||
// final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json");
|
||||
//
|
||||
// final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
|
||||
//
|
||||
// log.info("score = " + result);
|
||||
//
|
||||
// }
|
||||
//
|
||||
// @Ignore
|
||||
// @Test
|
||||
// public void testDistanceOrgs() throws IOException {
|
||||
//
|
||||
// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||
//
|
||||
// final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
|
||||
// final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
|
||||
//
|
||||
// Set<String> keysA = getGroupingKeys(conf, orgA);
|
||||
// Set<String> keysB = getGroupingKeys(conf, orgB);
|
||||
//
|
||||
// assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
|
||||
//
|
||||
// log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
|
||||
// log.info("clustering keys B = " + getGroupingKeys(conf, orgB));
|
||||
//
|
||||
// final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
|
||||
//
|
||||
// log.info("score = " + result);
|
||||
// log.info("distance = " + result.getScore());
|
||||
// }
|
||||
//
|
||||
// private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
||||
// return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||
// }
|
||||
//
|
||||
// private MapDocument asMapDocument(DedupConfig conf, final String json) {
|
||||
// OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
|
||||
// try {
|
||||
// JsonFormat.merge(json, b);
|
||||
// } catch (JsonFormat.ParseException e) {
|
||||
// throw new IllegalArgumentException(e);
|
||||
// }
|
||||
// return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
|
||||
// }
|
||||
//
|
||||
//
|
||||
//}
|
||||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.googlecode.protobuf.format.JsonFormat;
|
||||
import eu.dnetlib.data.proto.OafProtos;
|
||||
import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class DetectorTest extends AbstractProtoPaceTest {
|
||||
|
||||
// Class-level logger; the tests below use it to print each computed score.
private static final Log log = LogFactory.getLog(DetectorTest.class);
|
||||
|
||||
@Test
|
||||
public void testDistanceResultSimple() {
|
||||
final Config config = getResultSimpleConf();
|
||||
final MapDocument resA = result(config, "A", "Recent results from CDF");
|
||||
final MapDocument resB = result(config, "B", "Recent results from CDF");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue(d == 1.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultSimpleMissingDates() {
|
||||
final Config config = getResultSimpleConf();
|
||||
final MapDocument resA = result(config, "A", "Recent results from BES");
|
||||
final MapDocument resB = result(config, "A", "Recent results from CES");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue(d > 0.97);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultInvalidDate() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
|
||||
final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue(d == 1.0);
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testDistanceResultMissingOneDate() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "title title title 6BESR", null);
|
||||
final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue((d > 0.9) && (d < 1.0));
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testDistanceResult() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "title title title BES", "");
|
||||
final MapDocument resB = result(config, "B", "title title title CLEO");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue((d > 0.9) && (d < 1.0));
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testDistanceResultMissingTwoDate() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "bellaciao");
|
||||
final MapDocument resB = result(config, "B", "bellocioa");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue((d > 0.9) && (d < 1.0));
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testDistanceOrganizationIgnoreMissing() {
|
||||
final Config config = getOrganizationSimpleConf();
|
||||
final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
|
||||
final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue(d > 0.99);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDistanceOrganizations() {
|
||||
final Config config = getOrganizationTestConf();
|
||||
final MapDocument orgA = organization(config, "A", "UNIVERSITA DEGLI STUDI DI VERONA");
|
||||
final MapDocument orgB = organization(config, "B", "UNIVERSITY OF GENOVA");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCase1() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
|
||||
final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue((d > 0.9) && (d < 1.0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch1() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("exact DOIs will produce an exact match", d == 1.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch2() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch3() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch4() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch5() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch6() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseDoiMatch7() {
|
||||
final Config config = getResultConf();
|
||||
final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
|
||||
final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.9 & d < 1);
|
||||
}
|
||||
|
||||
// http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
|
||||
@Test
|
||||
public void testDistanceResultCaseAuthor1() {
|
||||
final Config config = getResultAuthorsConf();
|
||||
final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
|
||||
final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
||||
final List<String> pid = Lists.newArrayList();
|
||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue(d == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseAuthor2() {
|
||||
final Config config = getResultAuthorsConf();
|
||||
final List<String> authorsA = Lists.newArrayList("a", "b", "c");
|
||||
final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
||||
final List<String> pid = Lists.newArrayList();
|
||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue(d == 1.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseAuthor3() {
|
||||
final Config config = getResultAuthorsConf();
|
||||
final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
|
||||
final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
||||
final List<String> pid = Lists.newArrayList();
|
||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
assertTrue((d > 0.9) && (d < 1.0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultCaseAuthor4() {
|
||||
final Config config = getResultAuthorsConf();
|
||||
final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
|
||||
final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
||||
final List<String> pid = Lists.newArrayList();
|
||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultNoPidsConf() {
|
||||
final Config config = getResultFullConf();
|
||||
final MapDocument resA =
|
||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
|
||||
final MapDocument resB =
|
||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double s = sr.getScore();
|
||||
log.info(sr.toString());
|
||||
log.info(String.format(" s ---> %s", s));
|
||||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultPidsConf() {
|
||||
final Config config = getResultFullConf();
|
||||
final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||
final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||
final List<String> pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
|
||||
final MapDocument resA =
|
||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
pidA, authorsA);
|
||||
final List<String> pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
|
||||
final MapDocument resB =
|
||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
|
||||
pidB, authorsB);
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double s = sr.getScore();
|
||||
log.info(sr.toString());
|
||||
log.info(String.format(" s ---> %s", s));
|
||||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultFullConf() {
|
||||
final Config config = getResultFullConf();
|
||||
final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||
final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||
final MapDocument resA =
|
||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
"10.1186/1752-1947-4-299", authorsA);
|
||||
final MapDocument resB =
|
||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
"10.1186/1752-1947-4-299", authorsB);
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double d = sr.getScore();
|
||||
log.info(String.format(" d ---> %s", d));
|
||||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testDistance() throws IOException {
|
||||
final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
|
||||
final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json");
|
||||
final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json");
|
||||
final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
|
||||
log.info("score = " + result);
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testDistanceOrgs() throws IOException {
|
||||
final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||
final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
|
||||
final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
|
||||
Set<String> keysA = getGroupingKeys(conf, orgA);
|
||||
Set<String> keysB = getGroupingKeys(conf, orgB);
|
||||
assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
|
||||
log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
|
||||
log.info("clustering keys B = " + getGroupingKeys(conf, orgB));
|
||||
final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
|
||||
log.info("score = " + result);
|
||||
log.info("distance = " + result.getScore());
|
||||
}
|
||||
|
||||
private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
||||
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||
}
|
||||
|
||||
private MapDocument asMapDocument(DedupConfig conf, final String json) {
|
||||
OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
|
||||
try {
|
||||
JsonFormat.merge(json, b);
|
||||
} catch (JsonFormat.ParseException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.98",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }
|
||||
],
|
||||
"blacklists" : { }
|
||||
}
|
||||
}
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"run" : "001",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue