tests and related resources migrated from openaire-mapping-utils

This commit is contained in:
Claudio Atzori 2018-10-18 15:30:51 +02:00
parent 8cc925f017
commit 0bab8cf704
20 changed files with 2194 additions and 8 deletions

View File

@ -59,6 +59,19 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -1,24 +1,21 @@
package eu.dnetlib.graph
import java.lang
package eu.dnetlib.graph
import eu.dnetlib.ConnectedComponent
import eu.dnetlib.pace.model.MapDocument
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import scala.collection.JavaConversions
;
import scala.collection.JavaConversions;
object GraphProcessor {
def findCCs(vertexes: RDD[(VertexId,MapDocument)], edges:RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
val graph: Graph[MapDocument, String] = Graph(vertexes, edges)
val cc = graph.connectedComponents(maxIterations).vertices
val joinResult = vertexes.leftOuterJoin(cc).map {
case (id, (openaireId, cc)) => {
if (cc.isEmpty){
if (cc.isEmpty) {
(id, openaireId)
}
else {
@ -33,7 +30,7 @@ object GraphProcessor {
}
def asConnectedComponent(group: (VertexId, Iterable[MapDocument])) : ConnectedComponent = {
def asConnectedComponent(group: (VertexId, Iterable[MapDocument])): ConnectedComponent = {
val docs = group._2.toSet[MapDocument]
val connectedComponent = new ConnectedComponent("empty", JavaConversions.setAsJavaSet[MapDocument](docs));
connectedComponent.initializeID();

View File

@ -0,0 +1,198 @@
package eu.dnetlib.pace;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
import eu.dnetlib.data.proto.ResultProtos.Result;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import eu.dnetlib.pace.model.gt.GTAuthor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Base class for PACE tests that need {@link MapDocument}s built from protobuf entities.
 *
 * <p>It offers loaders for the dedup configurations stored on the classpath under
 * {@code /eu/dnetlib/pace/} and factory methods building minimal result / organization documents.
 *
 * <p>The helpers {@code sf(...)} and {@code getQualifier(...)} used below are not declared in this class;
 * they are presumably inherited from {@link OafTest} (NOTE(review): confirm).
 */
public abstract class AbstractProtoPaceTest extends OafTest {

    /** Loads the dedup configuration stored in {@code result.full.pace.conf}. */
    protected DedupConfig getResultFullConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
    }

    /** Loads the dedup configuration stored in {@code result.simple.pace.conf}. */
    protected DedupConfig getResultSimpleConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
    }

    /** Loads the dedup configuration stored in {@code result.pace.conf}. */
    protected DedupConfig getResultConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
    }

    /** Loads the dedup configuration stored in {@code organization.pace.conf}. */
    protected DedupConfig getOrganizationSimpleConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
    }

    /** Loads the dedup configuration stored in {@code result.authors.pace.conf}. */
    protected DedupConfig getResultAuthorsConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
    }

    /** Loads the dedup configuration stored in {@code result.prod.pace.conf}. */
    protected DedupConfig getResultProdConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
    }

    /**
     * Builds a document from the entity wrapped by the given {@link Oaf}.
     *
     * @param conf configuration providing the model used to extract the fields
     * @param id   identifier assigned to the document
     * @param oaf  protobuf wrapper whose entity is converted
     * @return the document built by {@link ProtoDocumentBuilder}
     */
    protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
        return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
    }

    /**
     * Reads a JSON resource from the classpath and deserializes it as a {@link GTAuthor}.
     *
     * @param path classpath location of the JSON resource
     * @return the deserialized author
     */
    protected GTAuthor getGTAuthor(final String path) {
        return new Gson().fromJson(readFromClasspath(path), GTAuthor.class);
    }

    /**
     * Reads a classpath resource fully into a string, decoding it as UTF-8.
     *
     * @param filename classpath location of the resource, resolved against this class
     * @return the resource content
     * @throws IllegalArgumentException when the resource cannot be found
     * @throws RuntimeException         when reading the resource fails; the I/O error is kept as cause
     */
    protected String readFromClasspath(final String filename) {
        // getResourceAsStream returns null for a missing resource: report it explicitly instead of
        // letting the copy fail with an anonymous NullPointerException.
        try (java.io.InputStream in = getClass().getResourceAsStream(filename)) {
            if (in == null) {
                throw new IllegalArgumentException("cannot find resource on classpath: " + filename);
            }
            final StringWriter sw = new StringWriter();
            // explicit charset: the fixtures contain non-ASCII text and must not depend on the platform default
            IOUtils.copy(in, sw, "UTF-8");
            return sw.toString();
        } catch (final IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + filename, e);
        }
    }

    /** Builds a result document with the given title, no date, no pid and no authors. */
    protected MapDocument result(final Config config, final String id, final String title) {
        return result(config, id, title, null, new ArrayList<>(), null);
    }

    /** Builds a result document with the given title and date, no pid and no authors. */
    protected MapDocument result(final Config config, final String id, final String title, final String date) {
        return result(config, id, title, date, new ArrayList<>(), null);
    }

    /** Builds a result document with the given title, date and pids, without authors. */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
        return result(config, id, title, date, pid, null);
    }

    /** Builds a result document with the given title, date and a single pid, without authors. */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
        return result(config, id, title, date, pid, null);
    }

    /** Builds a result document with the given title, date, a single pid and the given authors. */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
        return result(config, id, title, date, Lists.newArrayList(pid), authors);
    }

    /**
     * Builds a result document.
     *
     * @param config  configuration providing the model used to extract the fields
     * @param id      identifier of the entity and of the document
     * @param title   main title; when not blank a random alternative title is added as well
     * @param date    date of acceptance; skipped when blank
     * @param pid     pid values, each registered with type {@code doi}; blank values are skipped, may be {@code null}
     * @param authors author names, ranked by their position in the list; may be {@code null}
     * @return the document built by {@link ProtoDocumentBuilder}
     */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
        final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
        if (!StringUtils.isBlank(title)) {
            metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
            metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
        }
        if (!StringUtils.isBlank(date)) {
            metadata.setDateofacceptance(sf(date));
        }

        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
        final Result.Builder result = Result.newBuilder().setMetadata(metadata);

        if (authors != null) {
            // the index of each name in the list becomes the rank of the author
            result.getMetadataBuilder().addAllAuthor(
                    IntStream.range(0, authors.size())
                            .mapToObj(i -> author(authors.get(i), i))
                            .collect(Collectors.toCollection(LinkedList::new)));
        }
        entity.setResult(result);

        if (pid != null) {
            for (final String p : pid) {
                if (!StringUtils.isBlank(p)) {
                    entity.addPid(sp(p, "doi"));
                }
            }
        }

        return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
    }

    /**
     * Builds an author from a raw name; name and surname are set only when the parsed person is accurate.
     *
     * @param s    raw author name
     * @param rank rank assigned to the author
     */
    private Author author(final String s, int rank) {
        final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
        final Author.Builder author = Author.newBuilder();
        if (p.isAccurate()) {
            author.setName(p.getNormalisedFirstName());
            author.setSurname(p.getNormalisedSurname());
        }
        author.setFullname(p.getNormalisedFullname());
        author.setRank(rank);
        return author.build();
    }

    /** Creates an entity builder initialised with the given id and type. */
    private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
        return OafEntity.newBuilder().setId(id).setType(type);
    }

    /** Builds an organization document with the given legal name and no legal short name. */
    protected MapDocument organization(final Config config, final String id, final String legalName) {
        return organization(config, id, legalName, null);
    }

    /**
     * Builds an organization document.
     *
     * @param config         configuration providing the model used to extract the fields
     * @param id             identifier of the entity and of the document
     * @param legalName      legal name; skipped when {@code null}
     * @param legalShortName legal short name; skipped when {@code null}
     * @return the document built by {@link ProtoDocumentBuilder}
     */
    protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
        final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
        if (legalName != null) {
            metadata.setLegalname(sf(legalName));
        }
        if (legalShortName != null) {
            metadata.setLegalshortname(sf(legalShortName));
        }

        // the entity carries an organization body, so it must be typed as organization (it was typed as result)
        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.organization);
        entity.setOrganization(Organization.newBuilder().setMetadata(metadata));

        return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
    }

    /** Builds a pid property whose qualifier uses {@code type} as class and {@code dnet:pid_types} as scheme. */
    private StructuredProperty sp(final String pid, final String type) {
        final Qualifier.Builder qualifier = Qualifier.newBuilder()
                .setClassid(type)
                .setClassname(type)
                .setSchemeid("dnet:pid_types")
                .setSchemename("dnet:pid_types");
        return StructuredProperty.newBuilder().setValue(pid).setQualifier(qualifier).build();
    }

    /** Creates a string field named {@code title} holding the given value. */
    protected Field title(final String s) {
        return new FieldValueImpl(Type.String, "title", s);
    }

    /** Creates a structured property builder with the given value and qualifier. */
    protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
        return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
    }
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,42 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Before;
import org.junit.Test;
/**
 * Smoke test for {@link BlacklistAwareClusteringCombiner#filterAndCombine}: the computed keys are only logged,
 * no assertion is made on them.
 */
public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);

    /** Configuration shared by the tests, reloaded before each of them. */
    private Config config;

    @Before
    public void setUp() {
        config = getResultFullConf();
    }

    /** Enriches a result document with a description and an extra title, then logs its clustering keys. */
    @Test
    public void testCombine() {
        final MapDocument result =
                result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");

        final FieldListImpl desc = new FieldListImpl();
        desc.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
        result.getFieldMap().put("desc", desc);

        // A dedicated list is used for the extra title: clearing and reusing the list stored under "desc"
        // would wipe the description and leave the title value registered under the "desc" key.
        final FieldListImpl extraTitle = new FieldListImpl();
        extraTitle.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));

        final FieldListImpl titles = (FieldListImpl) result.getFieldMap().get("title");
        titles.add(extraTitle);

        log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config));
    }
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Before;
import org.junit.Test;
/**
 * Smoke test for {@link ClusteringCombiner#combine}: the title and the computed keys are only logged,
 * no assertion is made on them.
 */
public class ClusteringCombinerTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);

    /** Configuration shared by the tests, reloaded before each of them. */
    private Config config;

    @Before
    public void setUp() {
        config = getResultFullConf();
    }

    /** Adds a description field to a result document and logs the outcome of the combiner. */
    @Test
    public void testCombine() {
        final String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
        final MapDocument doc = result(config, "A", title, "2013");

        final FieldListImpl description = new FieldListImpl();
        description.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
        doc.getFieldMap().put("desc", description);

        log.info(title);
        log.info(ClusteringCombiner.combine(doc, config));
    }
}

View File

@ -0,0 +1,405 @@
package eu.dnetlib.pace.distance;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
 * Tests the score computed by {@link PaceDocumentDistance} on pairs of documents built with the factory
 * methods of {@link AbstractProtoPaceTest}, under the different dedup configurations it can load.
 */
public class DetectorTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(DetectorTest.class);

    @Test
    public void testDistanceResultSimple() {
        final Config config = getResultSimpleConf();
        final MapDocument resA = result(config, "A", "Recent results from CDF");
        final MapDocument resB = result(config, "B", "Recent results from CDF");
        assertTrue(score(resA, resB, config) == 1.0);
    }

    @Test
    public void testDistanceResultSimpleMissingDates() {
        final Config config = getResultSimpleConf();
        final MapDocument resA = result(config, "A", "Recent results from BES");
        // NOTE(review): both documents use the id "A" while every other test uses "A"/"B" - confirm it is intended
        final MapDocument resB = result(config, "A", "Recent results from CES");
        assertTrue(score(resA, resB, config) > 0.97);
    }

    @Test
    public void testDistanceResultInvalidDate() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
        final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
        assertTrue(score(resA, resB, config) == 1.0);
    }

    @Ignore
    @Test
    public void testDistanceResultMissingOneDate() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "title title title 6BESR", null);
        final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
        final double d = score(resA, resB, config);
        assertTrue((d > 0.9) && (d < 1.0));
    }

    @Ignore
    @Test
    public void testDistanceResult() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "title title title BES", "");
        final MapDocument resB = result(config, "B", "title title title CLEO");
        final double d = score(resA, resB, config);
        assertTrue((d > 0.9) && (d < 1.0));
    }

    @Ignore
    @Test
    public void testDistanceResultMissingTwoDate() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "title title title 6BESR");
        final MapDocument resB = result(config, "B", "title title title 6CLER");
        final double d = score(resA, resB, config);
        assertTrue((d > 0.9) && (d < 1.0));
    }

    @Ignore
    @Test
    public void testDistanceOrganizationIgnoreMissing() {
        final Config config = getOrganizationSimpleConf();
        final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
        final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
        assertTrue(score(orgA, orgB, config) > 0.99);
    }

    @Test
    public void testDistanceResultCase1() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
        final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
        final double d = score(resA, resB, config);
        assertTrue((d > 0.9) && (d < 1.0));
    }

    @Test
    public void testDistanceResultCaseDoiMatch1() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
        assertTrue("exact DOIs will produce an exact match", score(resA, resB, config) == 1.0);
    }

    @Test
    public void testDistanceResultCaseDoiMatch2() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
        assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years",
                score(resA, resB, config) == 1.0);
    }

    @Test
    public void testDistanceResultCaseDoiMatch3() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
        final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
        assertTrue("a missing DOI will casue the comparsion to continue with the following conditions",
                score(resA, resB, config) == 1.0);
    }

    @Test
    public void testDistanceResultCaseDoiMatch4() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
        final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
        assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0",
                score(resA, resB, config) == 0.0);
    }

    @Test
    public void testDistanceResultCaseDoiMatch5() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
        final double d = score(resA, resB, config);
        assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
    }

    @Test
    public void testDistanceResultCaseDoiMatch6() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
        final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
        assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", score(resA, resB, config) == 1.0);
    }

    @Test
    public void testDistanceResultCaseDoiMatch7() {
        final Config config = getResultConf();
        final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
        final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
        final double d = score(resA, resB, config);
        // the message now describes the asserted range; it used to claim that the score drops to 0
        assertTrue("expected a high but not exact score: titles differ and only one of the pids is shared", (d > 0.9) && (d < 1.0));
    }

    // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855

    @Test
    public void testDistanceResultCaseAuthor1() {
        final Config config = getResultAuthorsConf();
        final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
        final List<String> authorsB = Lists.newArrayList("a", "b", "c");
        final List<String> pid = Lists.newArrayList();
        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
        assertTrue(score(resA, resB, config) == 0.0);
    }

    @Test
    public void testDistanceResultCaseAuthor2() {
        final Config config = getResultAuthorsConf();
        final List<String> authorsA = Lists.newArrayList("a", "b", "c");
        final List<String> authorsB = Lists.newArrayList("a", "b", "c");
        final List<String> pid = Lists.newArrayList();
        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
        assertTrue(score(resA, resB, config) == 1.0);
    }

    @Test
    public void testDistanceResultCaseAuthor3() {
        final Config config = getResultAuthorsConf();
        final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
        final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
        final List<String> pid = Lists.newArrayList();
        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
        final double d = score(resA, resB, config);
        assertTrue((d > 0.9) && (d < 1.0));
    }

    /** Only logs the score: no expectation is asserted. */
    @Test
    public void testDistanceResultCaseAuthor4() {
        final Config config = getResultAuthorsConf();
        final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
        final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
        final List<String> pid = Lists.newArrayList();
        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
        score(resA, resB, config);
    }

    /** Only logs the score: no expectation is asserted. */
    @Test
    public void testDistanceResultFullConf() {
        final Config config = getResultFullConf();
        final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
        final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
        final MapDocument resA =
                result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
                        "10.1186/1752-1947-4-299", authorsA);
        final MapDocument resB =
                result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
                        "10.1186/1752-1947-4-299", authorsB);
        score(resA, resB, config);
    }

    @Ignore
    @Test
    public void testDistance() throws IOException {
        final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
        // asMapDocument expects the JSON content, not its classpath location
        final MapDocument crossref = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/crossref.json"));
        final MapDocument alicante = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/alicante.json"));
        final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
        log.info("score = " + result);
    }

    @Ignore
    @Test
    public void testDistanceOrgs() throws IOException {
        final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
        final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
        final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));

        final Set<String> keysA = getGroupingKeys(conf, orgA);
        final Set<String> keysB = getGroupingKeys(conf, orgB);
        // the two organizations must share at least one clustering key
        assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
        log.info("clustering keys A = " + keysA);
        log.info("clustering keys B = " + keysB);

        final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
        log.info("score = " + result);
        log.info("distance = " + result.getScore());
    }

    /** Computes the score between the two documents under the given configuration and logs it. */
    private double score(final MapDocument a, final MapDocument b, final Config config) {
        final ScoreResult sr = new PaceDocumentDistance().between(a, b, config);
        final double d = sr.getScore();
        log.info(String.format(" d ---> %s", d));
        return d;
    }

    /** Collects into a set the keys produced by {@link BlacklistAwareClusteringCombiner#filterAndCombine}. */
    private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
        return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
    }

    /**
     * Parses a JSON-encoded entity and converts it into a document using the model of the given configuration.
     *
     * @param conf configuration providing the model
     * @param json JSON content (not a path) describing the entity
     * @throws IllegalArgumentException when the JSON cannot be parsed
     */
    private MapDocument asMapDocument(DedupConfig conf, final String json) {
        final OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
        try {
            JsonFormat.merge(json, b);
        } catch (JsonFormat.ParseException e) {
            throw new IllegalArgumentException(e);
        }
        return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
    }
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.model;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.DetectorTest;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentSerializer;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Test;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
 * Checks that a document built by {@link ProtoDocumentBuilder} survives a round trip through
 * {@link MapDocumentSerializer} without losing field names.
 */
public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class);

    /** Builds a document, serializes it, decodes it back and verifies no original field name is missing. */
    @Test
    public void test_serialise1() {
        final String id = "12345";
        final Config conf = getResultFullConf();

        final MapDocument original = ProtoDocumentBuilder.newInstance(id, getResult(id), conf.model());
        assertFalse(original.fieldNames().isEmpty());
        assertFalse(Iterables.isEmpty(original.fields()));
        log.info("original:\n" + original);

        final String serialized = MapDocumentSerializer.toString(original);
        log.info("srialization:\n" + serialized);

        final MapDocument roundTripped = MapDocumentSerializer.decode(serialized.getBytes());
        // field names present in the original document but absent from the decoded one
        final SetView<String> missing = Sets.difference(original.fieldNames(), roundTripped.fieldNames());
        assertTrue(missing.isEmpty());
        log.info("decoded:\n" + roundTripped);
    }
}

View File

@ -0,0 +1,121 @@
{
"dateoftransformation": "2018-08-07T06:48:42.668Z",
"originalId": [
"oai:rua.ua.es:10045/34236"
],
"oaiprovenance": {
"originDescription": {
"metadataNamespace": "http://www.openarchives.org/OAI/2.0/oai_dc/",
"altered": true,
"baseURL": "http://rua.ua.es/dspace-oai/request",
"datestamp": "2016-04-28T11:28:35Z",
"harvestDate": "2018-06-14T13:53:42.185Z",
"identifier": "oai:rua.ua.es:10045/34236"
}
},
"result": {
"instance": [
{
"hostedby": {
"value": "Repositorio Institucional de la Universidad de Alicante",
"key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
},
"url": [
"http://hdl.handle.net/10045/34236"
],
"dateofacceptance": {
"value": "2013-11-27"
},
"collectedfrom": {
"value": "Repositorio Institucional de la Universidad de Alicante",
"key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemename": "dnet:access_modes",
"schemeid": "dnet:access_modes"
},
"instancetype": {
"classid": "0010",
"classname": "Lecture",
"schemename": "dnet:publication_resource",
"schemeid": "dnet:publication_resource"
}
}
],
"metadata": {
"language": {
"classid": "eng",
"classname": "English",
"schemename": "dnet:languages",
"schemeid": "dnet:languages"
},
"title": [
{
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemename": "dnet:dataCite_title",
"schemeid": "dnet:dataCite_title"
},
"value": "Henry James (1843-1916)"
}
],
"journal": {
"name": ""
},
"author": [
{
"fullname": "Gómez Reus, Teresa",
"surname": "Gómez Reus",
"name": "Teresa",
"rank": 1
}
],
"resulttype": {
"classid": "other",
"classname": "other",
"schemename": "dnet:result_typologies",
"schemeid": "dnet:result_typologies"
},
"dateofacceptance": {
"value": "2013-11-27"
},
"contributor": [
{
"value": "Universidad de Alicante. Departamento de Filología Inglesa"
}
],
"subject": [
{
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemename": "dnet:result_subject",
"schemeid": "dnet:result_subject"
},
"value": "James, Henry"
},
{
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemename": "dnet:result_subject",
"schemeid": "dnet:result_subject"
},
"value": "Filología Inglesa"
}
]
}
},
"collectedfrom": [
{
"value": "Repositorio Institucional de la Universidad de Alicante",
"key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
}
],
"dateofcollection": "2018-06-14T13:53:42.185Z",
"type": 50,
"id": "50|od_______935::2b908ad38030168759c568f49af50784"
}

View File

@ -0,0 +1,78 @@
{
"pid": [
{
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemename": "dnet:pid_types",
"schemeid": "dnet:pid_types"
},
"value": "10.1002/9781444393675.ch6"
}
],
"result": {
"instance": [
{
"url": [
"http://dx.doi.org/10.1002/9781444393675.ch6"
],
"collectedfrom": {
"value": "CrossRef",
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"
},
"hostedby": {
"value": "Unknown Repository",
"key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"
},
"accessright": {
"classid": "CLOSED",
"classname": "Closed Access",
"schemename": "dnet:access_modes",
"schemeid": "dnet:access_modes"
},
"instancetype": {
"classid": "0013",
"classname": "Part of book or chapter of book",
"schemename": "dnet:publication_resource",
"schemeid": "dnet:publication_resource"
}
}
],
"metadata": {
"title": [
{
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemename": "dnet:dataCite_title",
"schemeid": "dnet:dataCite_title"
},
"value": "Henry James (1843-1916)"
}
],
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemename": "dnet:result_typologies",
"schemeid": "dnet:result_typologies"
}
}
},
"collectedfrom": [
{
"value": "Microsoft Academic Graph",
"key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"
},
{
"value": "CrossRef",
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"
},
{
"value": "UnpayWall",
"key": "10|openaire____::8ac8380272269217cb09a928c8caa993"
}
],
"dateofcollection": "2018-08-07 12:24:48Z",
"type": 50,
"id": "50|crossref____::0000002a9885b7ec89b7b9d8ff3331a0"
}

View File

@ -0,0 +1,34 @@
{
"wf" : {
"threshold" : "0.85",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "20000",
"groupMaxSize" : "20",
"slidingWindowSize" : "400",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"conditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "mustBeDifferent", "fields" : [ "gridid" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "legalshortname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.6", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : { }
}
}

View File

@ -0,0 +1,34 @@
{
"dateoftransformation": "2018-06-04",
"originalId": [
"opendoar____::Institute_of_Information_Science_and_Technology_&quot;A._Faedo&quot;"
],
"collectedfrom": [
{
"value": "OpenDOAR",
"key": "10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"
}
],
"organization": {
"metadata": {
"legalshortname": {
"value": "CNR-ISTI"
},
"websiteurl": {
"value": "http://www.isti.cnr.it/aaaaa"
},
"country": {
"classid": "IT",
"classname": "IT",
"schemename": "dnet:countries",
"schemeid": "dnet:countries"
},
"legalname": {
"value": "Institute of Information Science and Technology &quot;A. Faedo&quot;"
}
}
},
"dateofcollection": "2015-08-24",
"type": 20,
"id": "20|opendoar____::68d8b122736484cb07f75885af22e82f"
}

View File

@ -0,0 +1,48 @@
{
"collectedfrom": [
{
"value": "GRID - Global Research Identifier Database",
"key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"
}
],
"organization": {
"metadata": {
"legalshortname": {
"value": "ISTI"
},
"websiteurl": {
"value": "http://www.isti.cnr.it/aaaaaa"
},
"country": {
"classid": "IT",
"classname": "Italy",
"schemename": "dnet:countries",
"schemeid": "dnet:countries"
},
"alternativeNames": [
{
"value": "Istituto di Scienza e Tecnologie dell'Informazione \"A. Faedo\""
},
{
"value": "ISTI"
}
],
"legalname": {
"value": "CNR - Institute of Information Science and Technologies"
}
}
},
"pid": [
{
"qualifier": {
"classid": "grid",
"classname": "grid",
"schemename": "dnet:pid_types",
"schemeid": "dnet:pid_types"
},
"value": "grid.451498.5"
}
],
"type": 20,
"id": "20|grid________::e4095563f4e9d34dff7d47fb98af042f"
}

View File

@ -0,0 +1,25 @@
{
"wf" : {
"threshold" : "0.99",
"run" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"conditions" : [
{ "name" : "sizeMatch", "fields" : [ "authors" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
],
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
"blacklists" : { }
}
}

View File

@ -0,0 +1,51 @@
{
"wf" : {
"threshold" : "0.99",
"run" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"conditions" : [
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] } ,
{ "name" : "pidMatch", "fields" : [ "pid" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] }
}
}

View File

@ -0,0 +1,29 @@
{
"wf" : {
"threshold" : "0.99",
"run" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"strictConditions" : [
{ "name" : "pidMatch", "fields" : [ "pid" ] }
],
"conditions" : [
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" }
],
"blacklists" : { }
}
}

View File

@ -0,0 +1,273 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "4000",
"groupMaxSize" : "40",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
"includeChildren" : "true",
"maxChildren" : "40"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"strictConditions" : [
{ "name" : "pidMatch", "fields" : [ "pid" ] }
],
"conditions" : [
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
],
"blacklists" : {
"title" : [
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
}
}
}

View File

@ -0,0 +1,275 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "4000",
"groupMaxSize" : "40",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
"includeChildren" : "true",
"maxChildren" : "40"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"strictConditions" : [
{ "name" : "pidMatch", "fields" : [ "pid" ] }
],
"conditions" : [
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
"blacklists" : {
"title" : [
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
}
}
}

View File

@ -0,0 +1,21 @@
{
"wf" : {
"threshold" : "0.99",
"run" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"conditions" : [ ],
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }
],
"blacklists" : { }
}
}

View File

@ -109,6 +109,13 @@
<artifactId>dnet-openaire-data-protos</artifactId>
<version>3.9.3-proto250</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
<version>6.2.17-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>