merged JqMapping branch into tree2

This commit is contained in:
Sandro La Bruzzo 2019-12-13 11:30:02 +01:00
commit d09193a094
30 changed files with 370 additions and 1627 deletions

58
.gitignore vendored
View File

@ -1,43 +1,21 @@
*~
# Compiled class file
*.class
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
*target
# Package Files #
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar
*.idea
*.iml
.DS_Store
**/.DS_Store
.project
.idea
*.iml
*~
.classpath
/*/.classpath
/*/*/.classpath
.metadata
/*/.metadata
/*/*/.metadata
.project
.log
.settings
**/.project
**/.classpath
**/.settings
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
/*/*/target
/*/target
/target
/*/*/build
/*/build
/build
spark-warehouse
/dhp-workflows/dhp-graph-mapper/job-override.properties

View File

@ -15,32 +15,31 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
<!-- <plugin>-->
<!-- <groupId>org.apache.maven.plugins</groupId>-->
<!-- <artifactId>maven-shade-plugin</artifactId>-->
<!-- <version>2.4.3</version>-->
<!-- <executions>-->
<!-- <execution>-->
<!-- <phase>package</phase>-->
<!-- <goals>-->
<!-- <goal>shade</goal>-->
<!-- </goals>-->
<!-- <configuration>-->
<!-- <filters>-->
<!-- <filter>-->
<!-- <artifact>*:*</artifact>-->
<!-- <excludes>-->
<!-- <exclude>META-INF/*.SF</exclude>-->
<!-- <exclude>META-INF/*.DSA</exclude>-->
<!-- <exclude>META-INF/*.RSA</exclude>-->
<!-- </excludes>-->
<!-- </filter>-->
<!-- </filters>-->
<!-- </configuration>-->
<!-- </execution>-->
<!-- </executions>-->
<!-- </plugin>-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@ -114,10 +113,6 @@
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@ -133,12 +128,6 @@
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
@ -150,12 +139,6 @@
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
<artifactId>oozie-client</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>

View File

@ -4,7 +4,7 @@ import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.reporter.SparkReporter;
import eu.dnetlib.support.ConnectedComponent;
@ -99,7 +99,7 @@ public class Deduper implements Serializable {
*/
public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
return entities.mapToPair(it -> {
MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, it);
return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
});
}

View File

@ -1,92 +0,0 @@
package eu.dnetlib.pace.utils;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.data.proto.ResultProtos;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static eu.dnetlib.proto.utils.OAFProtoUtils.*;
import static eu.dnetlib.proto.utils.OAFProtoUtils.author;
import static eu.dnetlib.proto.utils.OAFProtoUtils.sp;
/**
 * Static helpers that build {@link MapDocument} instances, either from the individual
 * attributes of a result (title, date, pids, authors) or from the JSON form of an OAF entity.
 */
public class PaceUtils implements Serializable {

    /** Builds a result document that carries only a title. */
    public static MapDocument result(final Config config, final String id, final String title) {
        final List<String> noPids = new ArrayList<>();
        return result(config, id, title, null, noPids, null);
    }

    /** Builds a result document from a title and an acceptance date. */
    public static MapDocument result(final Config config, final String id, final String title, final String date) {
        final List<String> noPids = new ArrayList<>();
        return result(config, id, title, date, noPids, null);
    }

    /** Builds a result document from a title, a date and a list of pids; no authors. */
    public static MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
        final List<String> noAuthors = null;
        return result(config, id, title, date, pid, noAuthors);
    }

    /** Builds a result document from a title, a date and a single pid; no authors. */
    public static MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
        final List<String> noAuthors = null;
        return result(config, id, title, date, pid, noAuthors);
    }

    /** Builds a result document from a title, a date, a single pid and a list of author names. */
    public static MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
        final List<String> singlePid = Lists.newArrayList(pid);
        return result(config, id, title, date, singlePid, authors);
    }

    /**
     * Assembles a result entity and converts it to a {@link MapDocument} using the model of {@code config}.
     *
     * @param config  configuration whose model drives the conversion
     * @param id      identifier of both the entity and the resulting document
     * @param title   main title; when not blank, a random 10-letter alternative title is added as well
     * @param date    date of acceptance; skipped when blank
     * @param pid     pid values, each registered with type "doi"; blank values and a {@code null} list are skipped
     * @param authors author names, ranked by their position in the list; may be {@code null}
     * @return the document built from the assembled entity
     */
    public static MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
        final ResultProtos.Result.Metadata.Builder metadataBuilder = ResultProtos.Result.Metadata.newBuilder();

        if (StringUtils.isNotBlank(title)) {
            metadataBuilder.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
            final String randomTitle = RandomStringUtils.randomAlphabetic(10);
            metadataBuilder.addTitle(getStruct(randomTitle, getQualifier("alternative title", "dnet:titles")));
        }
        if (StringUtils.isNotBlank(date)) {
            metadataBuilder.setDateofacceptance(sf(date));
        }

        final OafProtos.OafEntity.Builder entityBuilder = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
        final ResultProtos.Result.Builder resultBuilder = ResultProtos.Result.newBuilder().setMetadata(metadataBuilder);

        if (authors != null) {
            final ResultProtos.Result.Metadata.Builder resultMetadata = resultBuilder.getMetadataBuilder();
            resultMetadata.addAllAuthor(
                    IntStream.range(0, authors.size())
                            .mapToObj(rank -> author(authors.get(rank), rank))
                            .collect(Collectors.toCollection(LinkedList::new)));
        }
        entityBuilder.setResult(resultBuilder);

        if (pid != null) {
            pid.stream()
                    .filter(StringUtils::isNotBlank)
                    .forEach(value -> entityBuilder.addPid(sp(value, "doi")));
        }

        return ProtoDocumentBuilder.newInstance(id, entityBuilder.build(), config.model());
    }

    /**
     * Parses the JSON form of an OAF entity and converts it to a {@link MapDocument}
     * using the model found in the pace section of {@code conf}.
     *
     * @param conf dedup configuration providing the model
     * @param json JSON form of the entity
     * @return the document, identified by the id of the parsed entity
     * @throws IllegalArgumentException when the JSON cannot be merged into an entity;
     *         the offending JSON is printed to standard output first
     */
    public static MapDocument asMapDocument(DedupConfig conf, final String json) {
        final OafProtos.OafEntity.Builder parsedEntity = OafProtos.OafEntity.newBuilder();
        try {
            JsonFormat.merge(json, parsedEntity);
        } catch (JsonFormat.ParseException parseFailure) {
            System.out.println("**************************** " + json);
            throw new IllegalArgumentException(parseFailure);
        }
        final String identifier = parsedEntity.getId();
        return ProtoDocumentBuilder.newInstance(identifier, parsedEntity.build(), conf.getPace().getModel());
    }
}

View File

@ -1,43 +0,0 @@
package eu.dnetlib.proto.utils;
import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.OafProtos;
/**
 * Static factory helpers for the protobuf messages used when assembling OAF entities.
 */
public class OAFProtoUtils {

    /**
     * Builds an author message from a raw name string.
     *
     * @param s    the author name, handed to {@code eu.dnetlib.pace.model.Person} for normalisation
     * @param rank the rank stored on the author
     * @return the author; name and surname are set only when the parsed person reports itself as accurate,
     *         the full name and the rank are always set
     */
    public static FieldTypeProtos.Author author(final String s, int rank) {
        final eu.dnetlib.pace.model.Person person = new eu.dnetlib.pace.model.Person(s, false);
        final FieldTypeProtos.Author.Builder builder = FieldTypeProtos.Author.newBuilder();

        if (person.isAccurate()) {
            builder.setName(person.getNormalisedFirstName());
            builder.setSurname(person.getNormalisedSurname());
        }

        return builder
                .setFullname(person.getNormalisedFullname())
                .setRank(rank)
                .build();
    }

    /**
     * Builds a pid as a structured property whose qualifier uses {@code type} as both class id and
     * class name, within the "dnet:pid_types" scheme.
     */
    public static FieldTypeProtos.StructuredProperty sp(final String pid, final String type) {
        final FieldTypeProtos.Qualifier.Builder pidQualifier = getQualifier(type, "dnet:pid_types");
        return getStruct(pid, pidQualifier).build();
    }

    /** Wraps a string into a string-field builder. */
    public static FieldTypeProtos.StringField.Builder sf(final String s) {
        final FieldTypeProtos.StringField.Builder field = FieldTypeProtos.StringField.newBuilder();
        return field.setValue(s);
    }

    /** Creates a structured-property builder holding the given value and qualifier. */
    public static FieldTypeProtos.StructuredProperty.Builder getStruct(final String value, final FieldTypeProtos.Qualifier.Builder qualifier) {
        final FieldTypeProtos.StructuredProperty.Builder struct = FieldTypeProtos.StructuredProperty.newBuilder();
        struct.setValue(value);
        struct.setQualifier(qualifier);
        return struct;
    }

    /**
     * Creates a qualifier builder in which {@code classname} fills both class id and class name,
     * and {@code schemename} fills both scheme id and scheme name.
     */
    public static FieldTypeProtos.Qualifier.Builder getQualifier(final String classname, final String schemename) {
        final FieldTypeProtos.Qualifier.Builder qualifier = FieldTypeProtos.Qualifier.newBuilder();
        qualifier.setClassid(classname);
        qualifier.setClassname(classname);
        qualifier.setSchemeid(schemename);
        qualifier.setSchemename(schemename);
        return qualifier;
    }

    /** Creates an entity builder initialised with the given id and type. */
    public static OafProtos.OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
        return OafProtos.OafEntity.newBuilder()
                .setId(id)
                .setType(type);
    }
}

View File

@ -1,208 +0,0 @@
package eu.dnetlib.pace;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
import eu.dnetlib.data.proto.ResultProtos.Result;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.model.gt.GTAuthor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.RandomUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Base class for PACE tests: loads dedup configurations from the classpath and builds
 * {@link MapDocument} fixtures for results, organizations and authors.
 *
 * <p>{@code sf(...)}, {@code getQualifier(...)} are called but not defined here; presumably they are
 * inherited from {@code OafTest} (TODO confirm).
 */
public abstract class AbstractProtoPaceTest extends OafTest {

    /** PID types from which {@link #result(Config, String, String, String, List, List)} picks at random. */
    static List<String> pidTypes = Lists.newArrayList();

    static {
        pidTypes.add("doi");
        //pidTypes.add("oai");
        //pidTypes.add("pmid");
    }

    /** Loads the "current" organization dedup configuration from the classpath. */
    protected DedupConfig getOrganizationCurrentConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf"));
    }

    /** Loads the "test" organization dedup configuration from the classpath. */
    protected DedupConfig getOrganizationTestConf() {
        return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.test.conf"));
    }

    /** Converts the entity carried by {@code oaf} into a document, using the model of {@code conf}. */
    protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
        return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
    }

    /** Reads the JSON classpath resource at {@code path} and deserialises it into a {@link GTAuthor}. */
    protected GTAuthor getGTAuthor(final String path) {
        final String json = readFromClasspath(path);
        return new Gson().fromJson(json, GTAuthor.class);
    }

    /**
     * Reads a classpath resource, resolved against the runtime class of this test, into a string.
     *
     * @param filename the resource name
     * @return the resource content
     * @throws RuntimeException when the resource does not exist or cannot be read
     */
    protected String readFromClasspath(final String filename) {
        // try-with-resources closes the stream, which the previous implementation leaked
        try (java.io.InputStream in = getClass().getResourceAsStream(filename)) {
            if (in == null) {
                // getResourceAsStream signals a missing resource with null: fail with the resource
                // name instead of an anonymous NullPointerException deep inside IOUtils
                throw new RuntimeException("cannot load resource from classpath: " + filename);
            }
            final StringWriter sw = new StringWriter();
            IOUtils.copy(in, sw);
            return sw.toString();
        } catch (final IOException e) {
            // keep the cause so the underlying I/O problem stays visible
            throw new RuntimeException("cannot load resource from classpath: " + filename, e);
        }
    }

    /** Builds a result document that carries only a title. */
    protected MapDocument result(final Config config, final String id, final String title) {
        return result(config, id, title, null, new ArrayList<>(), null);
    }

    /** Builds a result document from a title and an acceptance date. */
    protected MapDocument result(final Config config, final String id, final String title, final String date) {
        return result(config, id, title, date, new ArrayList<>(), null);
    }

    /** Builds a result document from a title, a date and a list of pids; no authors. */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
        return result(config, id, title, date, pid, null);
    }

    /** Builds a result document from a title, a date and a single pid; no authors. */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
        return result(config, id, title, date, pid, null);
    }

    /** Builds a result document from a title, a date, a single pid and a list of author names. */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
        return result(config, id, title, date, Lists.newArrayList(pid), authors);
    }

    /**
     * Builds an author document directly from field values, without going through a protobuf entity.
     * Every scalar argument becomes a field named after the parameter; {@code coauthors} and
     * {@code topics} become list fields of string values.
     *
     * @param coauthors must not be {@code null}
     * @param topics    must not be {@code null}; each value is stored through its {@code toString()}
     */
    protected MapDocument author(final String identifier, final String area, final String firstname, final String lastname, final String fullname, final Double[] topics, final String pubID, final String pubDOI, final int rank, final String orcid, final List<String> coauthors) {
        final Map<String, Field> fieldMap = new HashMap<>();

        fieldMap.put("area", new FieldValueImpl(Type.String, "area", area));
        fieldMap.put("firstname", new FieldValueImpl(Type.String, "firstname", firstname));
        fieldMap.put("lastname", new FieldValueImpl(Type.String, "lastname", lastname));
        fieldMap.put("fullname", new FieldValueImpl(Type.String, "fullname", fullname));
        fieldMap.put("pubID", new FieldValueImpl(Type.String, "pubID", pubID));
        fieldMap.put("pubDOI", new FieldValueImpl(Type.String, "pubDOI", pubDOI));
        fieldMap.put("rank", new FieldValueImpl(Type.Int, "rank", rank));
        fieldMap.put("orcid", new FieldValueImpl(Type.String, "orcid", orcid));

        final FieldListImpl ca = new FieldListImpl("coauthors", Type.String);
        ca.addAll(coauthors.stream().map(s -> new FieldValueImpl(Type.String, "coauthors", s)).collect(Collectors.toList()));
        fieldMap.put("coauthors", ca);

        final FieldListImpl t = new FieldListImpl("topics", Type.String);
        t.addAll(Arrays.stream(topics).map(d -> new FieldValueImpl(Type.String, "topics", d.toString())).collect(Collectors.toList()));
        fieldMap.put("topics", t);

        return new MapDocument(identifier, fieldMap);
    }

    /**
     * Assembles a result entity and converts it to a {@link MapDocument} using the model of {@code config}.
     *
     * @param config  configuration whose model drives the conversion
     * @param id      identifier of both the entity and the resulting document
     * @param title   main title; when not blank, a random 10-letter alternative title is added as well
     * @param date    date of acceptance; skipped when blank
     * @param pid     pid values; blank values and a {@code null} list are skipped, every other value
     *                gets a type drawn at random from {@link #pidTypes}
     * @param authors author names, ranked by their position in the list; may be {@code null}
     * @return the document built from the assembled entity
     */
    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
        final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
        if (!StringUtils.isBlank(title)) {
            metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
            metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
        }
        if (!StringUtils.isBlank(date)) {
            metadata.setDateofacceptance(sf(date));
        }

        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
        final Result.Builder result = Result.newBuilder().setMetadata(metadata);

        if (authors != null) {
            result.getMetadataBuilder().addAllAuthor(
                    IntStream.range(0, authors.size())
                            .mapToObj(i -> author(authors.get(i), i))
                            .collect(Collectors.toCollection(LinkedList::new)));
        }

        entity.setResult(result);

        if (pid != null) {
            for (final String p : pid) {
                if (!StringUtils.isBlank(p)) {
                    // the upper bound of nextInt is exclusive: passing size() (not size() - 1)
                    // keeps the last pid type selectable once more types are enabled
                    entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size()))));
                }
            }
        }

        final OafEntity build = entity.build();
        return ProtoDocumentBuilder.newInstance(id, build, config.model());
    }

    /**
     * Builds an author message from a raw name; name and surname are set only when the parsed
     * person reports itself as accurate, full name and rank are always set.
     */
    private Author author(final String s, int rank) {
        final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
        final Author.Builder author = Author.newBuilder();
        if (p.isAccurate()) {
            author.setName(p.getNormalisedFirstName());
            author.setSurname(p.getNormalisedSurname());
        }
        author.setFullname(p.getNormalisedFullname());
        author.setRank(rank);
        return author.build();
    }

    /** Creates an entity builder initialised with the given id and type. */
    private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
        return OafEntity.newBuilder().setId(id).setType(type);
    }

    /** Builds an organization document that carries only a legal name. */
    protected MapDocument organization(final Config config, final String id, final String legalName) {
        return organization(config, id, legalName, null);
    }

    /**
     * Builds an organization document; {@code null} names are left unset.
     *
     * @param config         configuration whose model drives the conversion
     * @param id             identifier of both the entity and the resulting document
     * @param legalName      legal name, may be {@code null}
     * @param legalShortName short legal name, may be {@code null}
     */
    protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
        final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
        if (legalName != null) {
            metadata.setLegalname(sf(legalName));
        }
        if (legalShortName != null) {
            metadata.setLegalshortname(sf(legalShortName));
        }

        // the entity carries an organization body, so it is typed as an organization
        // (it used to be mislabelled as a result)
        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.organization);
        entity.setOrganization(Organization.newBuilder().setMetadata(metadata));

        return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
    }

    /**
     * Builds a pid as a structured property whose qualifier uses {@code type} as both class id and
     * class name, within the "dnet:pid_types" scheme.
     */
    private StructuredProperty sp(final String pid, final String type) {
        final Builder pidSp =
                StructuredProperty.newBuilder().setValue(pid)
                        .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types"));
        return pidSp.build();
    }

    /** Wraps a string into a string-typed field named "title". */
    protected Field title(final String s) {
        return new FieldValueImpl(Type.String, "title", s);
    }

    /** Creates a structured-property builder holding the given value and qualifier. */
    protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
        return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
    }
}

View File

@ -5,7 +5,7 @@ import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.tree.support.TreeStats;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
@ -13,7 +13,6 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import scala.Tuple2;
@ -31,7 +30,7 @@ public class DedupLocalTest extends DedupTestUtils {
@Before
public void setup() {
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf", DedupLocalTest.class));
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf.json", DedupLocalTest.class));
treeProcessor = new TreeProcessor(config);
final SparkSession spark = SparkSession
@ -45,7 +44,6 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void dedupTest(){
@ -59,7 +57,6 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void relationsTest() {
@ -115,15 +112,15 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void matchTest(){
String JSONEntity1 = "{\"dateoftransformation\":\"2018-06-04\",\"originalId\":[\"opendoar____::Universiti_Sains_Malaysia\"],\"collectedfrom\":[{\"value\":\"OpenDOAR\",\"key\":\"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my/\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Universiti Sains Malaysia\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2015-08-24\",\"type\":20,\"id\":\"20|opendoar____::04315c25b0eb56eacb967901557f86b1\"}";
String JSONEntity2 = "{\"dateoftransformation\":\"2019-10-07\",\"originalId\":[\"corda_______::997941627\"],\"collectedfrom\":[{\"value\":\"CORDA - COmmon Research DAta Warehouse\",\"key\":\"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"true\"},\"eclegalperson\":{\"value\":\"true\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"USM\"},\"ecresearchorganization\":{\"value\":\"true\"},\"ecnonprofit\":{\"value\":\"true\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"UNIVERSITI SAINS MALAYSIA*\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"true\"}}},\"dateofcollection\":\"2015-09-10\",\"type\":20,\"id\":\"20|corda_______::1fb0c86ddf389377454d5520d2796dad\"}";
MapDocument mapDoc1 = PaceUtils.asMapDocument(config, JSONEntity1);
MapDocument mapDoc2 = PaceUtils.asMapDocument(config, JSONEntity2);
MapDocument mapDoc1 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity1);
MapDocument mapDoc2 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity2);
TreeStats treeStats = treeProcessor.evaluateTree(mapDoc1, mapDoc2);
@ -131,12 +128,12 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void parseJSONEntityTest(){
String jsonEntity = "{\"dateoftransformation\":\"2018-09-19\",\"originalId\":[\"doajarticles::Sociedade_Brasileira_de_Reumatologia\"],\"collectedfrom\":[{\"value\":\"DOAJ-Articles\",\"key\":\"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"country\":{\"classid\":\"BR\",\"classname\":\"Brazil\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2018-09-19\",\"type\":20,\"id\":\"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568\"}";
MapDocument mapDocument = PaceUtils.asMapDocument(config, jsonEntity);
MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, jsonEntity);
System.out.println("mapDocument = " + mapDocument);
}

View File

@ -1,71 +0,0 @@
package eu.dnetlib.pace;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.client.OozieClientException;
import org.apache.oozie.client.WorkflowJob;
import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import java.util.Properties;
import static junit.framework.Assert.assertEquals;
/**
 * Integration test that submits the deduplication Oozie workflow to a remote cluster and waits
 * for it to finish. Ignored by default: it needs access to the Oozie and HDFS hosts hard-coded below.
 */
public class DedupTestIT {

    /**
     * Submits the workflow with the parameters read from the test properties file, prints the job
     * info every 10 seconds while the job is running and asserts that it ends as SUCCEEDED.
     *
     * @throws OozieClientException when the Oozie client fails to submit or query the job
     * @throws InterruptedException when the polling sleep is interrupted
     */
    @Ignore
    @Test
    public void deduplicationTest() throws OozieClientException, InterruptedException {

        // read properties to use in the oozie workflow
        final Properties prop = readProperties("/eu/dnetlib/test/properties/config.properties");

        /* OOZIE WORKFLOW CREATION AND LAUNCH */
        // client for the remote Oozie server
        final OozieClient wc = new OozieClient("http://hadoop-edge3.garr-pa1.d4science.org:11000/oozie");

        // create a workflow job configuration and set the workflow application path
        final Properties conf = wc.createConfiguration();
        conf.setProperty(OozieClient.APP_PATH, "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/michele.debonis/oozieJob/workflow.xml");
        conf.setProperty(OozieClient.USER_NAME, "michele.debonis");
        conf.setProperty("oozie.action.sharelib.for.spark", "spark2");
        conf.setProperty("oozie.use.system.libpath", "true");

        // setting workflow parameters
        conf.setProperty("jobTracker", "hadoop-rm3.garr-pa1.d4science.org:8032");
        conf.setProperty("nameNode", "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020");
        conf.setProperty("dedupConfiguration", prop.getProperty("dedup.configuration"));
        conf.setProperty("inputSpace", prop.getProperty("input.space"));
        conf.setProperty("outputPath", prop.getProperty("output"));
        conf.setProperty("statisticsPath", prop.getProperty("dedup.statistics"));

        // submit and start the workflow job
        final String jobId = wc.run(conf);
        System.out.println("Workflow job submitted");

        // wait until the workflow job finishes, printing the status every 10 seconds;
        // the job info is fetched once per iteration so that the printed state is the tested one
        WorkflowJob job = wc.getJobInfo(jobId);
        while (job.getStatus() == WorkflowJob.Status.RUNNING) {
            System.out.println(job);
            Thread.sleep(10 * 1000);
            job = wc.getJobInfo(jobId);
        }

        // print the final status of the workflow job
        System.out.println(job);
        // System.out.println("JOB LOG = " + wc.getJobLog(jobId));

        assertEquals(WorkflowJob.Status.SUCCEEDED, job.getStatus());
    }

    /**
     * Loads a properties file from the classpath.
     *
     * @param propFile the resource name, resolved against this class
     * @return the loaded properties
     * @throws IllegalStateException when the resource is missing or cannot be read; failing here
     *         avoids continuing with empty properties, which would only surface later as a
     *         NullPointerException on the first {@code setProperty} with a missing value
     */
    static Properties readProperties(final String propFile) {
        final Properties prop = new Properties();
        // try-with-resources closes the stream, which the previous implementation leaked
        try (java.io.InputStream in = DedupTestIT.class.getResourceAsStream(propFile)) {
            if (in == null) {
                throw new IllegalStateException("properties file not found on classpath: " + propFile);
            }
            prop.load(in);
        } catch (IOException e) {
            throw new IllegalStateException("cannot read properties file: " + propFile, e);
        }
        return prop;
    }
}

File diff suppressed because one or more lines are too long

View File

@ -1,41 +1,39 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Before;
import org.junit.Test;
public class ClusteringCombinerTest extends AbstractProtoPaceTest {
public class ClusteringCombinerTest {
private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);
private Config config;
@Before
public void setUp() {
config = getOrganizationTestConf();
}
@Test
public void testCombine() {
final MapDocument organization = organization(config, "A", "University of Turin", "UNITO");
log.info("University of Turin");
log.info(ClusteringCombiner.combine(organization, config));
}
@Test
public void testCombineBlacklistAware() {
final MapDocument organization = organization(config, "A", "University of Turin", "UNITO");
log.info("University of Turin");
log.info(BlacklistAwareClusteringCombiner.filterAndCombine(organization, config));
}
// TODO: re-implement these tests with the new configuration
// private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);
//
// private Config config;
//
// @Before
// public void setUp() {
// config = getOrganizationTestConf();
// }
//
// @Test
// public void testCombine() {
//
// final MapDocument organization = organization(config, "A", "University of Turin", "UNITO");
// log.info("University of Turin");
// log.info(ClusteringCombiner.combine(organization, config));
// }
//
// @Test
// public void testCombineBlacklistAware() {
//
// final MapDocument organization = organization(config, "A", "University of Turin", "UNITO");
// log.info("University of Turin");
// log.info(BlacklistAwareClusteringCombiner.filterAndCombine(organization, config));
// }
}

View File

@ -1,46 +0,0 @@
package eu.dnetlib.pace.model;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Test;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
 * Checks that a document built by {@link ProtoDocumentBuilder} keeps all of its field names
 * across a {@link MapDocumentSerializer} round trip.
 */
public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class);

    /** Builds an organization document, serialises it, decodes it and compares the field names. */
    @Test
    public void test_serialise1() {

        final String id = "12345";
        final Config config = getOrganizationTestConf();

        // build: the document must expose at least one field
        final MapDocument original = ProtoDocumentBuilder.newInstance(id, getOrganization(id), config.model());
        assertFalse(original.fieldNames().isEmpty());
        assertFalse(Iterables.isEmpty(original.fields()));
        log.info("original:\n" + original);

        // serialise
        final String serialised = MapDocumentSerializer.toString(original);
        log.info("serialization:\n" + serialised);

        // decode: no field name of the original may be missing afterwards
        final MapDocument roundTripped = MapDocumentSerializer.decode(serialised.getBytes());
        final SetView<String> missingFieldNames = Sets.difference(original.fieldNames(), roundTripped.fieldNames());
        assertTrue(missingFieldNames.isEmpty());
        log.info("decoded:\n" + roundTripped);
    }
}

View File

@ -7,6 +7,7 @@
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true",
"maxIterations": "20"
@ -185,12 +186,12 @@
}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"},
{ "name" : "originalId", "type" : "String", "path" : "id" }
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
"blacklists" : {
"legalname" : []
@ -301,7 +302,7 @@
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"]
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f", "a&f"],
"key::108" : ["agricultural mechanical", "am", "a m", "a&m"]
}

View File

@ -7,6 +7,7 @@
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true",
"maxIterations": "20"
@ -24,11 +25,13 @@
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : ".organization.metadata.country.classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : ".organization.metadata.legalshortname.value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : ".organization.metadata.legalname.value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : ".organization.metadata.websiteurl.value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".pid[] | select(.qualifier.classid == \"grid\") | .value" }
],
"blacklists" : {
"legalname" : ["University of Turin"]

View File

@ -1,109 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project version="4" relativePaths="false">
<component name="ProjectRootManager" version="2" assert-keyword="true" project-jdk-name="1.8" jdk-15="true"/>
<component name="CodeStyleManager">
<option name="USE_DEFAULT_CODE_STYLE_SCHEME" value="true"/>
<option name="CODE_STYLE_SCHEME" value=""/>
</component>
<component name="libraryTable"/>
<component name="CompilerConfiguration">
<option name="DEFAULT_COMPILER" value="Javac"/>
<option name="CLEAR_OUTPUT_DIRECTORY" value="false"/>
<!--
<wildcardResourcePatterns>
<entry name="${wildcardResourcePattern}"/>
</wildcardResourcePatterns>
-->
<wildcardResourcePatterns>
<entry name="!?*.java"/>
</wildcardResourcePatterns>
</component>
<component name="JavacSettings">
<option name="DEBUGGING_INFO" value="true"/>
<option name="GENERATE_NO_WARNINGS" value="false"/>
<option name="DEPRECATION" value="true"/>
<option name="ADDITIONAL_OPTIONS_STRING" value=""/>
<option name="MAXIMUM_HEAP_SIZE" value="128"/>
<option name="USE_GENERICS_COMPILER" value="false"/>
</component>
<component name="JikesSettings">
<option name="DEBUGGING_INFO" value="true"/>
<option name="DEPRECATION" value="true"/>
<option name="GENERATE_NO_WARNINGS" value="false"/>
<option name="GENERATE_MAKE_FILE_DEPENDENCIES" value="false"/>
<option name="DO_FULL_DEPENDENCE_CHECK" value="false"/>
<option name="IS_INCREMENTAL_MODE" value="false"/>
<option name="IS_EMACS_ERRORS_MODE" value="true"/>
<option name="ADDITIONAL_OPTIONS_STRING" value=""/>
<option name="MAXIMUM_HEAP_SIZE" value="128"/>
</component>
<component name="AntConfiguration">
<option name="IS_AUTOSCROLL_TO_SOURCE" value="false"/>
<option name="FILTER_TARGETS" value="false"/>
</component>
<component name="JavadocGenerationManager">
<option name="OUTPUT_DIRECTORY"/>
<option name="OPTION_SCOPE" value="protected"/>
<option name="OPTION_HIERARCHY" value="false"/>
<option name="OPTION_NAVIGATOR" value="false"/>
<option name="OPTION_INDEX" value="false"/>
<option name="OPTION_SEPARATE_INDEX" value="false"/>
<option name="OPTION_USE_1_1" value="false"/>
<option name="OPTION_DOCUMENT_TAG_USE" value="false"/>
<option name="OPTION_DOCUMENT_TAG_AUTHOR" value="false"/>
<option name="OPTION_DOCUMENT_TAG_VERSION" value="false"/>
<option name="OPTION_DOCUMENT_TAG_DEPRECATED" value="false"/>
<option name="OPTION_DEPRECATED_LIST" value="false"/>
<option name="OTHER_OPTIONS"/>
<option name="HEAP_SIZE"/>
<option name="OPEN_IN_BROWSER" value="false"/>
</component>
<component name="JUnitProjectSettings">
<option name="TEST_RUNNER" value="UI"/>
</component>
<component name="EntryPointsManager">
<entry_points/>
</component>
<component name="DataSourceManager"/>
<component name="ExportToHTMLSettings">
<option name="PRINT_LINE_NUMBERS" value="false"/>
<option name="OPEN_IN_BROWSER" value="false"/>
<option name="OUTPUT_DIRECTORY"/>
</component>
<component name="ImportConfiguration">
<option name="VENDOR"/>
<option name="RELEASE_TAG"/>
<option name="LOG_MESSAGE"/>
<option name="CHECKOUT_AFTER_IMPORT" value="true"/>
</component>
<component name="ProjectModuleManager">
<modules>
<!-- module filepath="$$PROJECT_DIR$$/${pom.artifactId}.iml"/ -->
<module filepath="$PROJECT_DIR$/dnet-dedup.iml"/>
<module filepath="$PROJECT_DIR$/dnet-pace-core/dnet-pace-core.iml"/>
<module filepath="$PROJECT_DIR$/dnet-dedup-test/dnet-dedup-test.iml"/>
</modules>
</component>
<UsedPathMacros>
<!--<macro name="cargo"></macro>-->
</UsedPathMacros>
</project>

View File

@ -1,418 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project version="4" relativePaths="false">
<component name="LvcsProjectConfiguration">
<option name="ADD_LABEL_ON_PROJECT_OPEN" value="true"/>
<option name="ADD_LABEL_ON_PROJECT_COMPILATION" value="true"/>
<option name="ADD_LABEL_ON_FILE_PACKAGE_COMPILATION" value="true"/>
<option name="ADD_LABEL_ON_PROJECT_MAKE" value="true"/>
<option name="ADD_LABEL_ON_RUNNING" value="true"/>
<option name="ADD_LABEL_ON_DEBUGGING" value="true"/>
<option name="ADD_LABEL_ON_UNIT_TEST_PASSED" value="true"/>
<option name="ADD_LABEL_ON_UNIT_TEST_FAILED" value="true"/>
</component>
<component name="PropertiesComponent">
<property name="MemberChooser.copyJavadoc" value="false"/>
<property name="GoToClass.includeLibraries" value="false"/>
<property name="MemberChooser.showClasses" value="true"/>
<property name="MemberChooser.sorted" value="false"/>
<property name="GoToFile.includeJavaFiles" value="false"/>
<property name="GoToClass.toSaveIncludeLibraries" value="false"/>
</component>
<component name="ToolWindowManager">
<frame x="-4" y="-4" width="1032" height="746" extended-state="6"/>
<editor active="false"/>
<layout>
<window_info id="CVS" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="-1"/>
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="7"/>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="0"/>
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="1"/>
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="1"/>
<window_info id="Messages" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="-1"/>
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.4" order="6"/>
<window_info id="Aspects" active="false" anchor="right" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="-1"/>
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="1"/>
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="2"/>
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="2"/>
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.4" order="4"/>
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="sliding" type="sliding" visible="false" weight="0.4" order="0"/>
<window_info id="Web" active="false" anchor="left" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="2"/>
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.33" order="0"/>
<window_info id="EJB" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="3"/>
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="docked" type="docked" visible="false" weight="0.25" order="5"/>
</layout>
</component>
<component name="ErrorTreeViewConfiguration">
<option name="IS_AUTOSCROLL_TO_SOURCE" value="false"/>
<option name="HIDE_WARNINGS" value="false"/>
</component>
<component name="StructureViewFactory">
<option name="SORT_MODE" value="0"/>
<option name="GROUP_INHERITED" value="true"/>
<option name="AUTOSCROLL_MODE" value="true"/>
<option name="SHOW_FIELDS" value="true"/>
<option name="AUTOSCROLL_FROM_SOURCE" value="false"/>
<option name="GROUP_GETTERS_AND_SETTERS" value="true"/>
<option name="SHOW_INHERITED" value="false"/>
<option name="HIDE_NOT_PUBLIC" value="false"/>
</component>
<component name="ProjectViewSettings">
<navigator currentView="ProjectPane" flattenPackages="false" showMembers="false" showStructure="false" autoscrollToSource="false" splitterProportion="0.5"/>
<view id="ProjectPane">
<expanded_node type="directory" url="file://$PROJECT_DIR$"/>
</view>
<view id="SourcepathPane"/>
<view id="ClasspathPane"/>
</component>
<component name="Commander">
<leftPanel view="Project"/>
<rightPanel view="Project"/>
<splitter proportion="0.5"/>
</component>
<component name="AspectsView"/>
<component name="SelectInManager"/>
<component name="HierarchyBrowserManager">
<option name="SHOW_PACKAGES" value="false"/>
<option name="IS_AUTOSCROLL_TO_SOURCE" value="false"/>
<option name="SORT_ALPHABETICALLY" value="false"/>
</component>
<component name="TodoView" selected-index="0">
<todo-panel id="selected-file">
<are-packages-shown value="false"/>
<flatten-packages value="false"/>
<is-autoscroll-to-source value="true"/>
</todo-panel>
<todo-panel id="all">
<are-packages-shown value="true"/>
<flatten-packages value="false"/>
<is-autoscroll-to-source value="true"/>
</todo-panel>
</component>
<component name="editorManager"/>
<component name="editorHistoryManager"/>
<component name="DaemonCodeAnalyzer">
<disable_hints/>
</component>
<component name="InspectionManager">
<option name="AUTOSCROLL_TO_SOURCE" value="false"/>
<option name="SPLITTER_PROPORTION" value="0.5"/>
<profile name="Default"/>
</component>
<component name="BookmarkManager"/>
<component name="DebuggerManager">
<line_breakpoints/>
<exception_breakpoints>
<breakpoint_any>
<option name="NOTIFY_CAUGHT" value="true"/>
<option name="NOTIFY_UNCAUGHT" value="true"/>
<option name="ENABLED" value="false"/>
<option name="SUSPEND_VM" value="true"/>
<option name="COUNT_FILTER_ENABLED" value="false"/>
<option name="COUNT_FILTER" value="0"/>
<option name="CONDITION_ENABLED" value="false"/>
<option name="CONDITION"/>
<option name="LOG_ENABLED" value="false"/>
<option name="LOG_EXPRESSION_ENABLED" value="false"/>
<option name="LOG_MESSAGE"/>
<option name="CLASS_FILTERS_ENABLED" value="false"/>
<option name="INVERSE_CLASS_FILLTERS" value="false"/>
<option name="SUSPEND_POLICY" value="SuspendAll"/>
</breakpoint_any>
</exception_breakpoints>
<field_breakpoints/>
<method_breakpoints/>
</component>
<component name="DebuggerSettings">
<option name="TRACING_FILTERS_ENABLED" value="true"/>
<option name="TOSTRING_CLASSES_ENABLED" value="false"/>
<option name="VALUE_LOOKUP_DELAY" value="700"/>
<option name="DEBUGGER_TRANSPORT" value="0"/>
<option name="FORCE_CLASSIC_VM" value="true"/>
<option name="HIDE_DEBUGGER_ON_PROCESS_TERMINATION" value="false"/>
<option name="SKIP_SYNTHETIC_METHODS" value="true"/>
<option name="SKIP_CONSTRUCTORS" value="false"/>
<option name="STEP_THREAD_SUSPEND_POLICY" value="SuspendThread"/>
<default_breakpoint_settings>
<option name="NOTIFY_CAUGHT" value="true"/>
<option name="NOTIFY_UNCAUGHT" value="true"/>
<option name="WATCH_MODIFICATION" value="true"/>
<option name="WATCH_ACCESS" value="true"/>
<option name="WATCH_ENTRY" value="true"/>
<option name="WATCH_EXIT" value="true"/>
<option name="ENABLED" value="true"/>
<option name="SUSPEND_VM" value="true"/>
<option name="COUNT_FILTER_ENABLED" value="false"/>
<option name="COUNT_FILTER" value="0"/>
<option name="CONDITION_ENABLED" value="false"/>
<option name="CONDITION"/>
<option name="LOG_ENABLED" value="false"/>
<option name="LOG_EXPRESSION_ENABLED" value="false"/>
<option name="LOG_MESSAGE"/>
<option name="CLASS_FILTERS_ENABLED" value="false"/>
<option name="INVERSE_CLASS_FILLTERS" value="false"/>
<option name="SUSPEND_POLICY" value="SuspendAll"/>
</default_breakpoint_settings>
<filter>
<option name="PATTERN" value="com.sun.*"/>
<option name="ENABLED" value="true"/>
</filter>
<filter>
<option name="PATTERN" value="java.*"/>
<option name="ENABLED" value="true"/>
</filter>
<filter>
<option name="PATTERN" value="javax.*"/>
<option name="ENABLED" value="true"/>
</filter>
<filter>
<option name="PATTERN" value="org.omg.*"/>
<option name="ENABLED" value="true"/>
</filter>
<filter>
<option name="PATTERN" value="sun.*"/>
<option name="ENABLED" value="true"/>
</filter>
<filter>
<option name="PATTERN" value="junit.*"/>
<option name="ENABLED" value="true"/>
</filter>
</component>
<component name="CompilerWorkspaceConfiguration">
<option name="COMPILE_IN_BACKGROUND" value="false"/>
<option name="AUTO_SHOW_ERRORS_IN_EDITOR" value="true"/>
</component>
<component name="RunManager">
<activeType name="Application"/>
<configuration selected="false" default="true" type="Applet" factoryName="Applet">
<module name=""/>
<option name="MAIN_CLASS_NAME"/>
<option name="HTML_FILE_NAME"/>
<option name="HTML_USED" value="false"/>
<option name="WIDTH" value="400"/>
<option name="HEIGHT" value="300"/>
<option name="POLICY_FILE" value="$APPLICATION_HOME_DIR$/bin/appletviewer.policy"/>
<option name="VM_PARAMETERS"/>
</configuration>
<configuration selected="false" default="true" type="Remote" factoryName="Remote">
<option name="USE_SOCKET_TRANSPORT" value="true"/>
<option name="SERVER_MODE" value="false"/>
<option name="SHMEM_ADDRESS" value="javadebug"/>
<option name="HOST" value="localhost"/>
<option name="PORT" value="5005"/>
</configuration>
<configuration selected="false" default="true" type="Application" factoryName="Application">
<option name="MAIN_CLASS_NAME"/>
<option name="VM_PARAMETERS"/>
<option name="PROGRAM_PARAMETERS"/>
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$"/>
<module name=""/>
</configuration>
<configuration selected="false" default="true" type="JUnit" factoryName="JUnit">
<module name=""/>
<option name="PACKAGE_NAME"/>
<option name="MAIN_CLASS_NAME"/>
<option name="METHOD_NAME"/>
<option name="TEST_OBJECT" value="class"/>
<option name="VM_PARAMETERS"/>
<option name="PARAMETERS"/>
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$"/>
<option name="ADDITIONAL_CLASS_PATH"/>
<option name="TEST_SEARCH_SCOPE">
<value defaultName="wholeProject"/>
</option>
</configuration>
</component>
<component name="VcsManagerConfiguration">
<option name="ACTIVE_VCS_NAME" value="git"/>
<option name="STATE" value="0"/>
</component>
<component name="VssConfiguration">
<CheckoutOptions>
<option name="COMMENT" value=""/>
<option name="DO_NOT_GET_LATEST_VERSION" value="false"/>
<option name="REPLACE_WRITABLE" value="false"/>
<option name="RECURSIVE" value="false"/>
</CheckoutOptions>
<CheckinOptions>
<option name="COMMENT" value=""/>
<option name="KEEP_CHECKED_OUT" value="false"/>
<option name="RECURSIVE" value="false"/>
</CheckinOptions>
<AddOptions>
<option name="COMMENT" value=""/>
<option name="STORE_ONLY_LATEST_VERSION" value="false"/>
<option name="CHECK_OUT_IMMEDIATELY" value="false"/>
<option name="FILE_TYPE" value="0"/>
</AddOptions>
<UndocheckoutOptions>
<option name="MAKE_WRITABLE" value="false"/>
<option name="REPLACE_LOCAL_COPY" value="0"/>
<option name="RECURSIVE" value="false"/>
</UndocheckoutOptions>
<DiffOptions>
<option name="IGNORE_WHITE_SPACE" value="false"/>
<option name="IGNORE_CASE" value="false"/>
</DiffOptions>
<GetOptions>
<option name="REPLACE_WRITABLE" value="0"/>
<option name="MAKE_WRITABLE" value="false"/>
<option name="RECURSIVE" value="false"/>
</GetOptions>
<option name="CLIENT_PATH" value=""/>
<option name="SRCSAFEINI_PATH" value=""/>
<option name="USER_NAME" value=""/>
<option name="PWD" value=""/>
<option name="SHOW_CHECKOUT_OPTIONS" value="true"/>
<option name="SHOW_ADD_OPTIONS" value="true"/>
<option name="SHOW_UNDOCHECKOUT_OPTIONS" value="true"/>
<option name="SHOW_DIFF_OPTIONS" value="true"/>
<option name="SHOW_GET_OPTIONS" value="true"/>
<option name="USE_EXTERNAL_DIFF" value="false"/>
<option name="EXTERNAL_DIFF_PATH" value=""/>
<option name="REUSE_LAST_COMMENT" value="false"/>
<option name="PUT_FOCUS_INTO_COMMENT" value="false"/>
<option name="SHOW_CHECKIN_OPTIONS" value="true"/>
<option name="LAST_COMMIT_MESSAGE" value=""/>
<option name="CHECKIN_DIALOG_SPLITTER_PROPORTION" value="0.8"/>
</component>
<component name="CheckinPanelState"/>
<component name="WebViewSettings">
<webview flattenPackages="false" showMembers="false" autoscrollToSource="false"/>
</component>
<component name="EjbViewSettings">
<EjbView showMembers="false" autoscrollToSource="false"/>
</component>
<component name="AppServerRunManager"/>
<component name="StarteamConfiguration">
<option name="SERVER" value=""/>
<option name="PORT" value="49201"/>
<option name="USER" value=""/>
<option name="PASSWORD" value=""/>
<option name="PROJECT" value=""/>
<option name="VIEW" value=""/>
<option name="ALTERNATIVE_WORKING_PATH" value=""/>
<option name="PUT_FOCUS_INTO_COMMENT" value="false"/>
<option name="SHOW_CHECKIN_OPTIONS" value="true"/>
<option name="LAST_COMMIT_MESSAGE" value=""/>
<option name="CHECKIN_DIALOG_SPLITTER_PROPORTION" value="0.8"/>
</component>
<component name="Cvs2Configuration">
<option name="ON_FILE_ADDING" value="0"/>
<option name="ON_FILE_REMOVING" value="0"/>
<option name="PRUNE_EMPTY_DIRECTORIES" value="true"/>
<option name="SHOW_UPDATE_OPTIONS" value="true"/>
<option name="SHOW_ADD_OPTIONS" value="true"/>
<option name="SHOW_REMOVE_OPTIONS" value="true"/>
<option name="MERGING_MODE" value="0"/>
<option name="MERGE_WITH_BRANCH1_NAME" value="HEAD"/>
<option name="MERGE_WITH_BRANCH2_NAME" value="HEAD"/>
<option name="RESET_STICKY" value="false"/>
<option name="CREATE_NEW_DIRECTORIES" value="true"/>
<option name="DEFAULT_TEXT_FILE_SUBSTITUTION" value="kv"/>
<option name="PROCESS_UNKNOWN_FILES" value="false"/>
<option name="PROCESS_DELETED_FILES" value="false"/>
<option name="SHOW_EDIT_DIALOG" value="true"/>
<option name="RESERVED_EDIT" value="false"/>
<option name="FILE_HISTORY_SPLITTER_PROPORTION" value="0.6"/>
<option name="SHOW_CHECKOUT_OPTIONS" value="true"/>
<option name="CHECKOUT_DATE_OR_REVISION_SETTINGS">
<value>
<option name="BRANCH" value=""/>
<option name="DATE" value=""/>
<option name="USE_BRANCH" value="false"/>
<option name="USE_DATE" value="false"/>
</value>
</option>
<option name="UPDATE_DATE_OR_REVISION_SETTINGS">
<value>
<option name="BRANCH" value=""/>
<option name="DATE" value=""/>
<option name="USE_BRANCH" value="false"/>
<option name="USE_DATE" value="false"/>
</value>
</option>
<option name="SHOW_CHANGES_REVISION_SETTINGS">
<value>
<option name="BRANCH" value=""/>
<option name="DATE" value=""/>
<option name="USE_BRANCH" value="false"/>
<option name="USE_DATE" value="false"/>
</value>
</option>
<option name="SHOW_OUTPUT" value="false"/>
<option name="SHOW_FILE_HISTORY_AS_TREE" value="false"/>
<option name="UPDATE_GROUP_BY_PACKAGES" value="false"/>
<option name="ADD_WATCH_INDEX" value="0"/>
<option name="REMOVE_WATCH_INDEX" value="0"/>
<option name="UPDATE_KEYWORD_SUBSTITUTION"/>
<option name="MAKE_NEW_FILES_READONLY" value="false"/>
<option name="SHOW_CORRUPTED_PROJECT_FILES" value="0"/>
<option name="TAG_AFTER_FILE_COMMIT" value="false"/>
<option name="TAG_AFTER_FILE_COMMIT_NAME" value=""/>
<option name="TAG_AFTER_PROJECT_COMMIT" value="false"/>
<option name="TAG_AFTER_PROJECT_COMMIT_NAME" value=""/>
<option name="PUT_FOCUS_INTO_COMMENT" value="false"/>
<option name="SHOW_CHECKIN_OPTIONS" value="true"/>
<option name="FORCE_NON_EMPTY_COMMENT" value="false"/>
<option name="LAST_COMMIT_MESSAGE" value=""/>
<option name="SAVE_LAST_COMMIT_MESSAGE" value="true"/>
<option name="CHECKIN_DIALOG_SPLITTER_PROPORTION" value="0.8"/>
<option name="OPTIMIZE_IMPORTS_BEFORE_PROJECT_COMMIT" value="false"/>
<option name="OPTIMIZE_IMPORTS_BEFORE_FILE_COMMIT" value="false"/>
<option name="REFORMAT_BEFORE_PROJECT_COMMIT" value="false"/>
<option name="REFORMAT_BEFORE_FILE_COMMIT" value="false"/>
<option name="FILE_HISTORY_DIALOG_COMMENTS_SPLITTER_PROPORTION" value="0.8"/>
<option name="FILE_HISTORY_DIALOG_SPLITTER_PROPORTION" value="0.5"/>
</component>
<component name="CvsTabbedWindow"/>
<component name="SvnConfiguration">
<option name="USER" value=""/>
<option name="PASSWORD" value=""/>
<option name="AUTO_ADD_FILES" value="0"/>
<option name="AUTO_DEL_FILES" value="0"/>
</component>
<component name="PerforceConfiguration">
<option name="PORT" value="magic:1666"/>
<option name="USER" value=""/>
<option name="PASSWORD" value=""/>
<option name="CLIENT" value=""/>
<option name="TRACE" value="false"/>
<option name="PERFORCE_STATUS" value="true"/>
<option name="CHANGELIST_OPTION" value="false"/>
<option name="SYSTEMROOT" value=""/>
<option name="P4_EXECUTABLE" value="p4"/>
<option name="SHOW_BRANCH_HISTORY" value="false"/>
<option name="GENERATE_COMMENT" value="false"/>
<option name="SYNC_OPTION" value="Sync"/>
<option name="PUT_FOCUS_INTO_COMMENT" value="false"/>
<option name="SHOW_CHECKIN_OPTIONS" value="true"/>
<option name="FORCE_NON_EMPTY_COMMENT" value="true"/>
<option name="LAST_COMMIT_MESSAGE" value=""/>
<option name="SAVE_LAST_COMMIT_MESSAGE" value="true"/>
<option name="CHECKIN_DIALOG_SPLITTER_PROPORTION" value="0.8"/>
<option name="OPTIMIZE_IMPORTS_BEFORE_PROJECT_COMMIT" value="false"/>
<option name="OPTIMIZE_IMPORTS_BEFORE_FILE_COMMIT" value="false"/>
<option name="REFORMAT_BEFORE_PROJECT_COMMIT" value="false"/>
<option name="REFORMAT_BEFORE_FILE_COMMIT" value="false"/>
<option name="FILE_HISTORY_DIALOG_COMMENTS_SPLITTER_PROPORTION" value="0.8"/>
<option name="FILE_HISTORY_DIALOG_SPLITTER_PROPORTION" value="0.5"/>
</component>
</project>

View File

@ -38,10 +38,6 @@
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
</dependency>
<dependency>
<groupId>com.googlecode.protobuf-java-format</groupId>
<artifactId>protobuf-java-format</artifactId>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
@ -59,22 +55,22 @@
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -1,25 +1,25 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.BiFunction;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.codehaus.jackson.map.ObjectMapper;
public class DedupConfig implements Config, Serializable {

View File

@ -1,12 +1,13 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver;
import org.codehaus.jackson.annotate.JsonIgnore;
import java.io.Serializable;
import java.util.List;

View File

@ -1,17 +1,17 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.gson.GsonBuilder;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.map.ObjectMapper;
public class WfConfig implements Serializable {
@ -76,12 +76,17 @@ public class WfConfig implements Serializable {
/** Maximum number of allowed children. */
private int maxChildren = MAX_CHILDREN;
/** Default maximum number of iterations. */
private final static int MAX_ITERATIONS = 20;
/** Maximum number of iterations */
private int maxIterations = MAX_ITERATIONS;
/** The JSONPath expression used to retrieve the identifier. */
private String idPath = "$.id";
public WfConfig() {}
/**
@ -252,6 +257,7 @@ public class WfConfig implements Serializable {
this.maxChildren = maxChildren;
}
/**
 * Returns the configured maximum number of iterations.
 *
 * @return the current value of {@code maxIterations}
 */
public int getMaxIterations() {
return maxIterations;
}
@ -260,6 +266,15 @@ public class WfConfig implements Serializable {
this.maxIterations = maxIterations;
}
/**
 * Returns the path expression used to retrieve the record identifier.
 *
 * @return the current value of {@code idPath}
 */
public String getIdPath() {
return idPath;
}
/**
 * Sets the path expression used to retrieve the record identifier.
 *
 * @param idPath the new path expression; stored as-is, no validation is performed here
 */
public void setIdPath(String idPath) {
this.idPath = idPath;
}
/*
* (non-Javadoc)
*

View File

@ -1,19 +1,15 @@
package eu.dnetlib.pace.model;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.clustering.*;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.codehaus.jackson.map.ObjectMapper;
public class ClusteringDef implements Serializable {

View File

@ -1,7 +1,8 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;

View File

@ -1,8 +1,9 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;

View File

@ -1,10 +1,11 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;

View File

@ -1,7 +1,7 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.HashMap;

View File

@ -36,6 +36,18 @@ public class BlockProcessor {
this.dedupConf = dedupConf;
}
/**
 * Processes a block of documents that share the same key.
 *
 * A block holding at most one document cannot yield any comparison: in that case only
 * the "records per hash key = 1" counter is incremented for the configured entity type.
 * Otherwise the documents are prepared and handed over to {@code process}.
 *
 * @param key       the key shared by the documents of the block (not used by this method)
 * @param documents the documents belonging to the block
 * @param context   the reporter used to increment counters
 */
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
    final boolean singleton = documents.size() <= 1;
    if (singleton) {
        context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
        return;
    }
    process(prepare(documents), context);
}
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context) {
final Queue<MapDocument> q = prepare(documents);

View File

@ -0,0 +1,109 @@
package eu.dnetlib.pace.util;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import net.minidev.json.JSONArray;
import java.util.*;
import java.util.function.Predicate;
/**
 * Static helpers that turn a JSON record into a {@link MapDocument}, reading the
 * identifier and every model field through the path expressions declared in a
 * {@link DedupConfig}. Paths are evaluated with {@link JsonPath}.
 */
public class MapDocumentUtil {

    /** Shared serializer used to render JSON fragments back to text. */
    private static final ObjectMapper mapper = new ObjectMapper();

    /** Pattern a (trimmed) value must match to be retained as a URL field. */
    public static final String URL_REGEX = "^(http|https|ftp)\\://.*";

    /** Accepts only the strings that, once trimmed, match {@link #URL_REGEX}. */
    public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);

    /**
     * Builds a {@link MapDocument} from a JSON record.
     *
     * The identifier is read from the path returned by {@code conf.getWf().getIdPath()};
     * one field is then created for every definition in {@code conf.getPace().getModel()},
     * according to its type:
     * String and Int are stored as a single value, URL is stored as a single value that is
     * blanked when it does not pass {@link #urlFilter}, List and JSON are stored as a list
     * of values. Definitions of any other type are ignored.
     *
     * @param conf the dedup configuration providing the id path and the field model
     * @param json the JSON record to read
     * @return the populated document
     */
    public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
        MapDocument m = new MapDocument();
        m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));
        Map<String, Field> stringField = new HashMap<>();
        conf.getPace().getModel().forEach(fdef -> {
            switch (fdef.getType()) {
                case String:
                case Int:
                    stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getJPathString(fdef.getPath(), json)));
                    break;
                case URL:
                    String uv = getJPathString(fdef.getPath(), json);
                    if (!urlFilter.test(uv)) {
                        uv = "";
                    }
                    stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv));
                    break;
                case List:
                case JSON:
                    FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType());
                    getJPathList(fdef.getPath(), json, fdef.getType())
                            .stream()
                            .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item))
                            .forEach(fi::add);
                    stringField.put(fdef.getName(), fi);
                    break;
            }
        });
        m.setFieldMap(stringField);
        return m;
    }

    /**
     * Evaluates a path against a JSON record and returns the matches as a list of strings.
     *
     * For {@code Type.List} the evaluation result is returned directly. For any other type
     * the result is normalised: every element of an array and a single object are
     * serialized back to JSON text, a plain string is returned as a one-element list.
     * Whenever the path cannot be evaluated (or, for {@code Type.List}, does not yield a
     * list) an empty list is returned; this method never returns {@code null}.
     *
     * @param path the path expression to evaluate
     * @param json the JSON record to read
     * @param type the declared type of the field being extracted
     * @return the extracted values, possibly empty
     */
    public static List<String> getJPathList(String path, String json, Type type) {
        if (type == Type.List) {
            try {
                // The assignment fails with a ClassCastException when the path does not
                // resolve to a list; that case is treated like a missing value.
                final List<String> values = JsonPath.read(json, path);
                return values != null ? values : new ArrayList<>();
            } catch (RuntimeException e) {
                return new ArrayList<>();
            }
        }

        final List<String> result = new ArrayList<>();
        final Object jresult;
        try {
            jresult = JsonPath.read(json, path);
        } catch (RuntimeException e) {
            // unreadable JSON or unresolved path: no values
            return result;
        }

        if (jresult instanceof JSONArray) {
            for (Object it : (JSONArray) jresult) {
                addAsJson(result, it);
            }
        } else if (jresult instanceof LinkedHashMap) {
            addAsJson(result, jresult);
        } else if (jresult instanceof String) {
            result.add((String) jresult);
        }
        return result;
    }

    /**
     * Evaluates a path against a JSON record and returns a single string value.
     *
     * A string result is returned as-is; for an array result the first element is
     * returned when it is a string. In every other case (including evaluation errors)
     * the empty string is returned; this method never returns {@code null}.
     *
     * @param jsonPath the path expression to evaluate
     * @param json     the JSON record to read
     * @return the extracted value, or the empty string
     */
    public static String getJPathString(final String jsonPath, final String json) {
        try {
            Object o = JsonPath.read(json, jsonPath);
            if (o instanceof String) {
                return (String) o;
            }
            if (o instanceof JSONArray && ((JSONArray) o).size() > 0) {
                final Object first = ((JSONArray) o).get(0);
                // a null or non-string first element counts as "no value"
                return first instanceof String ? (String) first : "";
            }
            return "";
        } catch (Exception e) {
            return "";
        }
    }

    /** Serializes {@code value} to JSON text and appends it to {@code target}. */
    private static void addAsJson(final List<String> target, final Object value) {
        try {
            target.add(mapper.writeValueAsString(value));
        } catch (JsonProcessingException ignored) {
            // best effort: a value that cannot be serialized is skipped
        }
    }
}

View File

@ -1,6 +1,10 @@
package eu.dnetlib.pace.config;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import java.util.Map;
@ -57,4 +61,27 @@ public class ConfigTest extends AbstractPaceTest {
assertEquals(0, load.getPace().translationMap().keySet().size());
}
/**
 * Loads the JSONPath-based configuration and the sample record from the test classpath and checks
 * that a document with a field map can be built from them.
 */
@Test
public void testAsMapDocumentJPath() throws Exception {
    DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json"));
    System.out.println(load.getWf().getIdPath());

    final String result;
    // Close the resource stream once it has been read.
    try (java.io.InputStream in = this.getClass().getResourceAsStream("result.json")) {
        org.junit.Assert.assertNotNull("result.json not found on the test classpath", in);
        // Explicit charset instead of the platform default; assumes the fixture is UTF-8 encoded.
        result = IOUtils.toString(in, "UTF-8");
    }
    System.out.println(result);

    final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result);
    org.junit.Assert.assertNotNull(mapDocument);
    org.junit.Assert.assertNotNull(mapDocument.getFieldMap());
    System.out.println(mapDocument.getFieldMap());
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,48 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"idPath": "$.entity.id",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"decisionTree": {},
"model" : [
{ "name" : "pid", "type" : "JSON", "path" : "$.entity.pid"},
{ "name" : "dateofacceptance", "type" : "String", "path" : "$.entity.result.metadata.dateofacceptance.value"},
{ "name" : "title", "type" : "String","path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" },
{ "name" : "authors", "type" : "List", "path" : "$.entity.result.metadata.author[*].fullname" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] } ,
"synonyms": {}
}
}

44
pom.xml
View File

@ -84,6 +84,16 @@
</snapshots>
</repository>
<repository>
<!-- Maven Central only accepts HTTPS connections; plain HTTP requests are rejected. -->
<id>central</id>
<name>Central Repository</name>
<url>https://repo.maven.apache.org/maven2</url>
<releases>
<enabled>true</enabled>
</releases>
</repository>
</repositories>
<build>
@ -246,21 +256,6 @@
<artifactId>stringtemplate</artifactId>
<version>3.2</version>
</dependency>
<dependency>
<groupId>com.googlecode.protobuf-java-format</groupId>
<artifactId>protobuf-java-format</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
<version>3.9.3-proto250</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
<version>6.2.21</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
@ -269,10 +264,17 @@
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.9.13</version>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-jsonSchema</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
@ -351,6 +353,12 @@
<artifactId>oozie-client</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<!-- JSONPath evaluation library; presumably what backs the JsonPath.read calls in MapDocumentUtil (confirm against its imports). -->
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.4.0</version>
</dependency>
</dependencies>
</dependencyManagement>