diff --git a/.gitignore b/.gitignore index 8ccf55e..486eace 100644 --- a/.gitignore +++ b/.gitignore @@ -1,43 +1,21 @@ -*~ - -# Compiled class file -*.class - -# Log file -*.log - -# BlueJ files -*.ctxt - -# Mobile Tools for Java (J2ME) -.mtj.tmp/ - - - -*target - -# Package Files # -*.jar -*.war -*.nar -*.ear -*.zip -*.tar.gz -*.rar - - -*.idea -*.iml - .DS_Store -**/.DS_Store - -.project +.idea +*.iml +*~ .classpath +/*/.classpath +/*/*/.classpath +.metadata +/*/.metadata +/*/*/.metadata +.project +.log .settings -**/.project -**/.classpath -**/.settings - -# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml -hs_err_pid* +/*/*/target +/*/target +/target +/*/*/build +/*/build +/build +spark-warehouse +/dhp-workflows/dhp-graph-mapper/job-override.properties diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 608b536..724ca9f 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -15,32 +15,31 @@ - - - org.apache.maven.plugins - maven-shade-plugin - 2.4.3 - - - package - - shade - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + org.apache.maven.plugins @@ -114,10 +113,6 @@ ${project.version} - - eu.dnetlib - dnet-openaire-data-protos - org.apache.spark @@ -133,12 +128,6 @@ spark-sql_2.11 - - eu.dnetlib - dnet-openaireplus-mapping-utils - - - junit junit @@ -150,12 +139,6 @@ jackson-databind - - org.apache.oozie - oozie-client - test - - org.scala-lang scala-library diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java index b180eac..877b1da 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java @@ -4,7 +4,7 @@ import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; -import eu.dnetlib.pace.utils.PaceUtils; +import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.reporter.SparkReporter; import eu.dnetlib.support.ConnectedComponent; @@ -99,7 +99,7 @@ public class Deduper implements Serializable { */ public static JavaPairRDD mapToVertexes(JavaSparkContext context, JavaRDD entities, DedupConfig config){ return entities.mapToPair(it -> { - MapDocument mapDocument = PaceUtils.asMapDocument(config, it); + MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, it); return new Tuple2<>(mapDocument.getIdentifier(), mapDocument); }); } diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/PaceUtils.java b/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/PaceUtils.java deleted file mode 100644 index c8b27c0..0000000 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/PaceUtils.java +++ /dev/null @@ -1,92 +0,0 @@ -package eu.dnetlib.pace.utils; - -import com.google.common.collect.Lists; -import com.googlecode.protobuf.format.JsonFormat; -import eu.dnetlib.data.proto.OafProtos; -import eu.dnetlib.data.proto.ResultProtos; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.ProtoDocumentBuilder; -import org.apache.commons.lang3.RandomStringUtils; -import org.apache.commons.lang3.StringUtils; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -import static eu.dnetlib.proto.utils.OAFProtoUtils.*; -import static eu.dnetlib.proto.utils.OAFProtoUtils.author; -import static eu.dnetlib.proto.utils.OAFProtoUtils.sp; - -public class PaceUtils implements Serializable { - - public static MapDocument result(final Config config, final String id, final String title) { - return result(config, id, title, null, new ArrayList<>(), null); - } - - public static MapDocument result(final Config config, final String id, final String title, final String date) { - return result(config, id, title, date, new ArrayList<>(), null); - } - - public static MapDocument result(final Config config, final String id, final String title, final String date, final List pid) { - return result(config, id, title, date, pid, null); - } - - public static MapDocument result(final Config config, final String id, final String title, final String date, final String pid) { - return result(config, id, title, date, pid, null); - } - - public static MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List authors) { - return result(config, id, title, date, Lists.newArrayList(pid), authors); - } - - public static MapDocument result(final Config config, final String id, final String title, final String date, final List pid, final List authors) { - final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); - if (!StringUtils.isBlank(title)) { - metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles"))); - metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles"))); - } - if (!StringUtils.isBlank(date)) { - metadata.setDateofacceptance(sf(date)); - } - - final OafProtos.OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); - final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder().setMetadata(metadata); - - if (authors != null) { - result.getMetadataBuilder().addAllAuthor( - IntStream.range(0, authors.size()) - .mapToObj(i -> author(authors.get(i), i)) - .collect(Collectors.toCollection(LinkedList::new))); - } - - entity.setResult(result); - - if (pid != null) { - for (String p : pid) { - if (!StringUtils.isBlank(p)) { - entity.addPid(sp(p, "doi")); - //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai")); - } - } - } - - final OafProtos.OafEntity build = entity.build(); - return ProtoDocumentBuilder.newInstance(id, build, config.model()); - } - - public static MapDocument asMapDocument(DedupConfig conf, final String json) { - OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder(); - try { - JsonFormat.merge(json, b); - } catch (JsonFormat.ParseException e) { - System.out.println("**************************** " + json); - throw new IllegalArgumentException(e); - } - return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel()); - } -} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/proto/utils/OAFProtoUtils.java b/dnet-dedup-test/src/main/java/eu/dnetlib/proto/utils/OAFProtoUtils.java deleted file mode 100644 index b920b46..0000000 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/proto/utils/OAFProtoUtils.java +++ /dev/null @@ -1,43 +0,0 @@ -package eu.dnetlib.proto.utils; - -import eu.dnetlib.data.proto.FieldTypeProtos; -import eu.dnetlib.data.proto.OafProtos; - -public class OAFProtoUtils { - - - public static FieldTypeProtos.Author author(final String s, int rank) { - final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false); - final FieldTypeProtos.Author.Builder author = FieldTypeProtos.Author.newBuilder(); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - } - author.setFullname(p.getNormalisedFullname()); - author.setRank(rank); - - return author.build(); - } - - public static FieldTypeProtos.StructuredProperty sp(final String pid, final String type) { - FieldTypeProtos.StructuredProperty.Builder pidSp = FieldTypeProtos.StructuredProperty.newBuilder().setValue(pid) - .setQualifier(FieldTypeProtos.Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types")); - return pidSp.build(); - } - - public static FieldTypeProtos.StringField.Builder sf(final String s) { return FieldTypeProtos.StringField.newBuilder().setValue(s); } - - public static FieldTypeProtos.StructuredProperty.Builder getStruct(final String value, final FieldTypeProtos.Qualifier.Builder qualifier) { - return FieldTypeProtos.StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier); - } - - public static FieldTypeProtos.Qualifier.Builder getQualifier(final String classname, final String schemename) { - return FieldTypeProtos.Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); - } - - public static OafProtos.OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) { - final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setId(id).setType(type); - return entity; - } - -} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java deleted file mode 100644 index 2281870..0000000 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java +++ /dev/null @@ -1,208 +0,0 @@ -package eu.dnetlib.pace; - -import com.google.common.collect.Lists; -import com.google.gson.Gson; -import eu.dnetlib.data.proto.FieldTypeProtos.Author; -import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; -import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; -import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; -import eu.dnetlib.data.proto.OafProtos.Oaf; -import eu.dnetlib.data.proto.OafProtos.OafEntity; -import eu.dnetlib.data.proto.OrganizationProtos.Organization; -import eu.dnetlib.data.proto.ResultProtos.Result; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.*; -import eu.dnetlib.pace.model.gt.GTAuthor; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.RandomStringUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang3.RandomUtils; - -import java.io.IOException; -import java.io.StringWriter; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -public abstract class AbstractProtoPaceTest extends OafTest { - - protected DedupConfig getOrganizationCurrentConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf")); - } - - protected DedupConfig getOrganizationTestConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.test.conf")); - } - - protected MapDocument author(final Config conf, final String id, final Oaf oaf) { - return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model()); - } - - protected GTAuthor getGTAuthor(final String path) { - - final Gson gson = new Gson(); - - final String json = readFromClasspath(path); - - final GTAuthor gta = gson.fromJson(json, GTAuthor.class); - - return gta; - } - - protected String readFromClasspath(final String filename) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(getClass().getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - - protected MapDocument result(final Config config, final String id, final String title) { - return result(config, id, title, null, new ArrayList<>(), null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date) { - return result(config, id, title, date, new ArrayList<>(), null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid) { - return result(config, id, title, date, pid, null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) { - return result(config, id, title, date, pid, null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List authors) { - return result(config, id, title, date, Lists.newArrayList(pid), authors); - } - - protected MapDocument author(final String identifier, final String area, final String firstname, final String lastname, final String fullname, final Double[] topics, final String pubID, final String pubDOI, final int rank, final String orcid, final List coauthors) { - Map fieldMap = new HashMap<>(); - - fieldMap.put("area", new FieldValueImpl(Type.String, "area", area)); - fieldMap.put("firstname", new FieldValueImpl(Type.String, "firstname", firstname)); - fieldMap.put("lastname", new FieldValueImpl(Type.String, "lastname", lastname)); - fieldMap.put("fullname", new FieldValueImpl(Type.String, "fullname", fullname)); - fieldMap.put("pubID", new FieldValueImpl(Type.String, "pubID", pubID)); - fieldMap.put("pubDOI", new FieldValueImpl(Type.String, "pubDOI", pubDOI)); - fieldMap.put("rank", new FieldValueImpl(Type.Int, "rank", rank)); - fieldMap.put("orcid", new FieldValueImpl(Type.String, "orcid", orcid)); - - FieldListImpl ca = new FieldListImpl("coauthors", Type.String); - ca.addAll(coauthors.stream().map(s -> new FieldValueImpl(Type.String, "coauthors", s)).collect(Collectors.toList())); - fieldMap.put("coauthors", ca); - - FieldListImpl t = new FieldListImpl("topics", Type.String); - t.addAll(Arrays.asList(topics).stream().map(d -> new FieldValueImpl(Type.String, "topics", d.toString())).collect(Collectors.toList())); - fieldMap.put("topics", t); - - return new MapDocument(identifier, fieldMap); - } - - static List pidTypes = Lists.newArrayList(); - static { - pidTypes.add("doi"); - //pidTypes.add("oai"); - //pidTypes.add("pmid"); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid, final List authors) { - final Result.Metadata.Builder metadata = Result.Metadata.newBuilder(); - if (!StringUtils.isBlank(title)) { - metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles"))); - metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles"))); - } - if (!StringUtils.isBlank(date)) { - metadata.setDateofacceptance(sf(date)); - } - - final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); - final Result.Builder result = Result.newBuilder().setMetadata(metadata); - - if (authors != null) { - result.getMetadataBuilder().addAllAuthor( - IntStream.range(0, authors.size()) - .mapToObj(i -> author(authors.get(i), i)) - .collect(Collectors.toCollection(LinkedList::new))); - } - - entity.setResult(result); - - if (pid != null) { - for(String p : pid) { - if (!StringUtils.isBlank(p)) { - entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1)))); - //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai")); - } - } - } - - final OafEntity build = entity.build(); - return ProtoDocumentBuilder.newInstance(id, build, config.model()); - } - - private Author author(final String s, int rank) { - final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false); - final Author.Builder author = Author.newBuilder(); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - } - author.setFullname(p.getNormalisedFullname()); - author.setRank(rank); - - return author.build(); - } - - private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) { - final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type); - return entity; - } - - protected MapDocument organization(final Config config, final String id, final String legalName) { - return organization(config, id, legalName, null); - } - - protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) { - final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder(); - if (legalName != null) { - metadata.setLegalname(sf(legalName)); - } - if (legalShortName != null) { - metadata.setLegalshortname(sf(legalShortName)); - } - - final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); - entity.setOrganization(Organization.newBuilder().setMetadata(metadata)); - - return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model()); - } - - private StructuredProperty sp(final String pid, final String type) { - final Builder pidSp = - StructuredProperty.newBuilder().setValue(pid) - .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types")); - return pidSp.build(); - } - - protected Field title(final String s) { - return new FieldValueImpl(Type.String, "title", s); - } - - protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) { - return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier); - } - - /* - * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); } - * - * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return - * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); } - */ - -} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 4d1f0bb..4659ebe 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -5,7 +5,7 @@ import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.support.TreeProcessor; import eu.dnetlib.pace.tree.support.TreeStats; -import eu.dnetlib.pace.utils.PaceUtils; +import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.support.ConnectedComponent; import org.apache.spark.api.java.JavaPairRDD; @@ -13,7 +13,6 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import scala.Tuple2; @@ -31,7 +30,7 @@ public class DedupLocalTest extends DedupTestUtils { @Before public void setup() { - config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf", DedupLocalTest.class)); + config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf.json", DedupLocalTest.class)); treeProcessor = new TreeProcessor(config); final SparkSession spark = SparkSession @@ -45,7 +44,6 @@ public class DedupLocalTest extends DedupTestUtils { } - @Ignore @Test public void dedupTest(){ @@ -59,7 +57,6 @@ public class DedupLocalTest extends DedupTestUtils { } - @Ignore @Test public void relationsTest() { @@ -115,15 +112,15 @@ public class DedupLocalTest extends DedupTestUtils { } - @Ignore + @Test public void matchTest(){ String JSONEntity1 = "{\"dateoftransformation\":\"2018-06-04\",\"originalId\":[\"opendoar____::Universiti_Sains_Malaysia\"],\"collectedfrom\":[{\"value\":\"OpenDOAR\",\"key\":\"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my/\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Universiti Sains Malaysia\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2015-08-24\",\"type\":20,\"id\":\"20|opendoar____::04315c25b0eb56eacb967901557f86b1\"}"; String JSONEntity2 = "{\"dateoftransformation\":\"2019-10-07\",\"originalId\":[\"corda_______::997941627\"],\"collectedfrom\":[{\"value\":\"CORDA - COmmon Research DAta Warehouse\",\"key\":\"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"true\"},\"eclegalperson\":{\"value\":\"true\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"USM\"},\"ecresearchorganization\":{\"value\":\"true\"},\"ecnonprofit\":{\"value\":\"true\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"UNIVERSITI SAINS MALAYSIA*\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"true\"}}},\"dateofcollection\":\"2015-09-10\",\"type\":20,\"id\":\"20|corda_______::1fb0c86ddf389377454d5520d2796dad\"}"; - MapDocument mapDoc1 = PaceUtils.asMapDocument(config, JSONEntity1); - MapDocument mapDoc2 = PaceUtils.asMapDocument(config, JSONEntity2); + MapDocument mapDoc1 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity1); + MapDocument mapDoc2 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity2); TreeStats treeStats = treeProcessor.evaluateTree(mapDoc1, mapDoc2); @@ -131,12 +128,12 @@ public class DedupLocalTest extends DedupTestUtils { } - @Ignore + @Test public void parseJSONEntityTest(){ String jsonEntity = "{\"dateoftransformation\":\"2018-09-19\",\"originalId\":[\"doajarticles::Sociedade_Brasileira_de_Reumatologia\"],\"collectedfrom\":[{\"value\":\"DOAJ-Articles\",\"key\":\"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"country\":{\"classid\":\"BR\",\"classname\":\"Brazil\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2018-09-19\",\"type\":20,\"id\":\"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568\"}"; - MapDocument mapDocument = PaceUtils.asMapDocument(config, jsonEntity); + MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, jsonEntity); System.out.println("mapDocument = " + mapDocument); } diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java deleted file mode 100644 index a23d6dd..0000000 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java +++ /dev/null @@ -1,71 +0,0 @@ -package eu.dnetlib.pace; - -import org.apache.oozie.client.OozieClient; -import org.apache.oozie.client.OozieClientException; -import org.apache.oozie.client.WorkflowJob; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.IOException; -import java.util.Properties; - -import static junit.framework.Assert.assertEquals; - -public class DedupTestIT { - - @Ignore - @Test - public void deduplicationTest() throws OozieClientException, InterruptedException { - - //read properties to use in the oozie workflow - Properties prop = readProperties("/eu/dnetlib/test/properties/config.properties"); - - /*OOZIE WORKFLOW CREATION AND LAUNCH*/ - // get a OozieClient for local Oozie - OozieClient wc = new OozieClient("http://hadoop-edge3.garr-pa1.d4science.org:11000/oozie"); - - // create a workflow job configuration and set the workflow application path - Properties conf = wc.createConfiguration(); - conf.setProperty(OozieClient.APP_PATH, "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/michele.debonis/oozieJob/workflow.xml"); - conf.setProperty(OozieClient.USER_NAME, "michele.debonis"); - conf.setProperty("oozie.action.sharelib.for.spark", "spark2"); - conf.setProperty("oozie.use.system.libpath", "true"); - - // setting workflow parameters - conf.setProperty("jobTracker", "hadoop-rm3.garr-pa1.d4science.org:8032"); - conf.setProperty("nameNode", "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020"); - conf.setProperty("dedupConfiguration", prop.getProperty("dedup.configuration")); - conf.setProperty("inputSpace", prop.getProperty("input.space")); - conf.setProperty("outputPath", prop.getProperty("output")); - conf.setProperty("statisticsPath", prop.getProperty("dedup.statistics")); - - // submit and start the workflow job - String jobId = wc.run(conf); - System.out.println("Workflow job submitted"); - - // wait until the workflow job finishes printing the status every 10 seconds - while (wc.getJobInfo(jobId).getStatus() == WorkflowJob.Status.RUNNING) { - System.out.println(wc.getJobInfo(jobId));; - Thread.sleep(10 * 1000); - } - - // print the final status of the workflow job - System.out.println(wc.getJobInfo(jobId)); -// System.out.println("JOB LOG = " + wc.getJobLog(jobId)); - - assertEquals(WorkflowJob.Status.SUCCEEDED, wc.getJobInfo(jobId).getStatus()); - - } - - static Properties readProperties(final String propFile) { - - Properties prop = new Properties(); - try { - prop.load(DedupTestIT.class.getResourceAsStream(propFile)); - } catch (IOException e) { - e.printStackTrace(); - } - return prop; - } - -} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java deleted file mode 100644 index 4c6604f..0000000 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java +++ /dev/null @@ -1,446 +0,0 @@ -package eu.dnetlib.pace; - -import com.google.protobuf.GeneratedMessage; -import com.google.protobuf.InvalidProtocolBufferException; -//import eu.dnetlib.data.mapreduce.util.OafDecoder; -import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization; -import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization.Provision; -import eu.dnetlib.data.proto.DatasourceProtos.Datasource; -import eu.dnetlib.data.proto.DedupProtos.Dedup; -import eu.dnetlib.data.proto.FieldTypeProtos.*; -import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; -import eu.dnetlib.data.proto.KindProtos.Kind; -import eu.dnetlib.data.proto.OafProtos.Oaf; -import eu.dnetlib.data.proto.OafProtos.OafEntity; -import eu.dnetlib.data.proto.OafProtos.OafRel; -import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization; -import eu.dnetlib.data.proto.OrganizationProtos.Organization; -import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization; -import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization.Participation; -import eu.dnetlib.data.proto.ProjectProtos.Project; -import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata; -import eu.dnetlib.data.proto.RelTypeProtos.RelType; -import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; -import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject; -import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome; -import eu.dnetlib.data.proto.ResultProtos.Result; -import eu.dnetlib.data.proto.ResultProtos.Result.Context; -import eu.dnetlib.data.proto.ResultProtos.Result.Instance; -import eu.dnetlib.data.proto.ResultResultProtos.ResultResult; -import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Similarity; -import eu.dnetlib.data.proto.TypeProtos.Type; - -public class OafTest { - - public static final String CITATION_JSON = - "\n \n [10] M. Foret et al., Phys. Rev. B 66, 024204 (2002).\n \n \n [11] B. Ru\175404\264e et al., Phys. Rev. Lett. 90, 095502 (2003).\n \n \n [12] U. Buchenau et al., Phys. Rev. B 34, 5665 (1986).\n \n \n [13] S.N. Taraskin and S.R. Elliott, J. Phys.: Condens. Mat- ter 11, A219 (1999).\n \n \n [14] B. Hehlen et al., Phys. Rev. Lett. 84, 5355 (2000).\n \n \n [15] N.V. Surotsev et al., J. Phys.: Condens. Matter 10, L113 (1998).\n \n \n [16] D.A. Parshin and C. Laermans, Phys. Rev. B 63, 132203 (2001).\n \n \n [17] V.L. Gurevich et al., Phys. Rev. B 67, 094203 (2003).\n \n \n [18] A. Matic et al., Phys. Rev. Lett. 86, 3803 (2001).\n \n \n [19] E. Rat et al., arXiv:cond-mat/0505558, 23 May 2005.\n \n \n [1] R.C. Zeller and R.O. Pohl, Phys. Rev. B 4, 2029 (1971).\n \n \n [20] C.A. Angell, J. Non-Cryst. Solids 131\20023133, 13 (1991).\n \n \n [21] A.P. Sokolov et al., Phys. Rev. Lett. 71, 2062 (1993).\n \n \n [22] T. Matsuo et al., Solid State Ionics 154-155, 759 (2002).\n \n \n [23] V.K. Malinovsky et al., Europhys. Lett. 11, 43 (1990).\n \n \n [24] J. Lor\250osch et al., J. Non-Cryst. Solids 69, 1 (1984).\n \n \n [25] U. Buchenau, Z. Phys. B 58, 181 (1985).\n \n \n [26] A.F. Io\175400e and A.R. Regel, Prog. Semicond. 4, 237 (1960).\n \n \n [27] R. Dell\20031Anna et al., Phys. Rev. Lett. 80, 1236 (1998).\n \n \n [28] D. Fioretto et al., Phys. Rev. E 59, 4470 (1999).\n \n \n [29] U. Buchenau et al., Phys. Rev. Lett. 77, 4035 (1996).\n \n \n [2] M. Rothenfusser et al., Phys. Rev. B 27, 5196 (1983).\n \n \n [30] J. Mattsson et al., J. Phys.: Condens. Matter 15, S1259 (2003).\n \n \n [31] T. Scopigno et al., Phys. Rev. Lett. 92, 025503 (2004).\n \n \n [32] M. Foret et al., Phys. Rev. Lett. 81, 2100 (1998).\n \n \n [33] F. Sette et al., Science 280, 1550 (1998).\n \n \n [34] J. Wuttke et al., Phys. Rev. E 52, 4026 (1995).\n \n \n [35] M.A. Ramos et al., Phys. Rev. Lett. 78, 82 (1997).\n \n \n [36] G. Monaco et al., Phys. Rev. Lett. 80, 2161 (1998).\n \n \n [37] A. T\250olle, Rep. Prog. Phys. 64, 1473 (2001).\n \n \n [38] As the straight lines do not cross the origin, this does not 2 imply \1623 \21035 \1651 .\n \n \n [39] A. Matic et al., Europhys. Lett. 54, 77 (2001).\n \n \n [3] S. Hunklinger and W. Arnold, in Physical Acoustics, Vol. XII, W.P. Mason and R.N. Thurston Eds. (Academic Press, N.Y. 1976), p. 155.\n \n \n [40] IXS data are usually not available below \1651co, mostly for experimental reasons. E.g., that the rapid onset was not evidenced in vitreous silica [27], is not indicative of its absence but rather of a low qco \21074 1 nm\210221.\n \n \n [41] G. Ruocco et al., Phys. Rev. Lett. 83, 5583 (1999).\n \n \n [42] D. C\1307 iplys et al., J. Physique (Paris) 42, C6-184 (1981).\n \n \n [43] R. Vacher et al., Rev. Sci. Instrum. 51, 288 (1980).\n \n \n [44] R. Vacher et al., arXiv:cond-mat/0505560, 23 May 2005.\n \n \n [45] T.N. Claytor et al., Phys. Rev. B 18, 5842 (1978).\n \n \n [46] M. Arai et al., Physica B 263-264, 268 (1999).\n \n \n [4] R. Vacher et al., J. Non-Cryst. Solids 45, 397 (1981); T.C. Zhu et al., Phys. Rev. B 44, 4281 (1991).\n \n \n [5] J.E. Graebner et al., Phys. Rev. B 34, 5696 (1986).\n \n \n [6] E. Duval and A. Mermet, Phys. Rev. B 58, 8159 (1998).\n \n \n [7] A. Matic et al., Phys. Rev. Lett. 93, 145502 (2004).\n \n \n [8] Often alluded to, e.g. in the Encyclopedia of Materials: Science and Technology, K.H.J. Buschow et al., Eds., Vol. 1 (Elsevier, Oxford, 2001), articles by S.R. Elliott on pp. 171-174 and U. Buchenau on pp. 212-215.\n \n \n [9] E. Rat et al., Phys. Rev. Lett. 83, 1355 (1999).\n \n"; - - public static final String STATISTICS_JSON = - "[{ \"citationsPerYear\": \"many\", \"anotherCoolStatistic\": \"WoW\", \"nestedStat\": { \"firstNestedStat\" : \"value 1\", \"secondNestedStat\" : \"value 2\"}, \"listingStat\" : [ \"one\", \"two\" ] }]"; - - public static Builder getStructuredproperty(final String value, final String classname, final String schemename) { - return getStructuredproperty(value, classname, schemename, null); - } - - public static Builder getStructuredproperty(final String value, final String classname, final String schemename, final DataInfo dataInfo) { - final Builder sp = StructuredProperty.newBuilder().setValue(value).setQualifier(getQualifier(classname, schemename)); - if (dataInfo != null) { - sp.setDataInfo(dataInfo); - } - return sp; - } - - public static Qualifier.Builder getQualifier(final String classname, final String schemename) { - return Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); - } - - public static KeyValue getKV(final String id, final String name) { - return KeyValue.newBuilder().setKey(id).setValue(name).build(); - } - - public static OafEntity getDatasource(final String datasourceId) { - return OafEntity - .newBuilder() - .setType(Type.datasource) - .setId(datasourceId) - .setDatasource( - Datasource.newBuilder().setMetadata( - Datasource.Metadata.newBuilder().setOfficialname(sf("officialname")).setEnglishname(sf("englishname")) - .setWebsiteurl(sf("websiteurl")).setContactemail(sf("contactemail")).addAccessinfopackage(sf("accessinforpackage")) - .setNamespaceprefix(sf("namespaceprofix")).setDescription(sf("description")).setOdnumberofitems(sf("numberofitems")) - .setOdnumberofitemsdate(sf("numberofitems date")) - // .addOdsubjects("subjects") - .setOdpolicies(sf("policies")).addOdlanguages(sf("languages")).addOdcontenttypes(sf("contenttypes")) - .setDatasourcetype(getQualifier("type class", "type scheme")))).build(); - } - - public static OafEntity getResult(final String id) { - return getResultBuilder(id).build(); - } - - public static OafEntity.Builder getResultBuilder(final String id) { - return OafEntity - .newBuilder() - .setType(Type.result) - .setId(id) - .setResult( - Result.newBuilder() - .setMetadata( - Result.Metadata - .newBuilder() - .addTitle( - getStructuredproperty( - "Analysis of cell viability in intervertebral disc: Effect of endplate permeability on cell population", - "main title", "dnet:result_titles", getDataInfo())) - .addTitle(getStructuredproperty("Another title", "alternative title", "dnet:result_titles", getDataInfo())) - .addSubject(getStructuredproperty("Biophysics", "subject", "dnet:result_sujects")) - .setDateofacceptance(sf("2010-01-01")).addSource(sf("sourceA")).addSource(sf("sourceB")) - .addContext(Context.newBuilder().setId("egi::virtual::970")) - .addContext(Context.newBuilder().setId("egi::classification::natsc::math::applied")) - .addContext(Context.newBuilder().setId("egi::classification::natsc::math")) - .addContext(Context.newBuilder().setId("egi::classification::natsc")) - .addContext(Context.newBuilder().setId("egi::classification")).addContext(Context.newBuilder().setId("egi")) - .addDescription(sf("Responsible for making and maintaining the extracellular matrix ...")) - .addDescription(sf("Another description ...")).setPublisher(sf("ELSEVIER SCI LTD")) - .setResulttype(getQualifier("publication", "dnet:result_types")) - .setLanguage(getQualifier("eng", "dnet:languages"))).addInstance(getInstance("10|od__10", "Uk pubmed")) - .addInstance(getInstance("10|od__10", "arxiv"))) - .addCollectedfrom(getKV("opendoar____::1064", "Oxford University Research Archive")) - .addPid(getStructuredproperty("doi:74293", "doi", "dnet:pids")).addPid(getStructuredproperty("oai:74295", "oai", "dnet:pids")) - .setDateofcollection(""); - } - - public static DataInfo getDataInfo() { - return getDataInfo("0.4"); - } - - public static DataInfo getDataInfo(final String trust) { - return DataInfo.newBuilder().setDeletedbyinference(false).setTrust("0.4").setInferenceprovenance("algo").setProvenanceaction(getQualifier("xx", "yy")) - .build(); - } - - public static Instance.Builder getInstance(final String hostedbyId, final String hostedbyName) { - return Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName)).setAccessright(getQualifier("OpenAccess", "dnet:access_modes")) - .setInstancetype(getQualifier("publication", "dnet:result_typologies")).addUrl("webresource url"); - - } - - public static OafRel getDedupRel(final String source, final String target, final RelType relType, final String relClass) { - return OafRel.newBuilder().setSource(source).setTarget(target).setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass) - .setChild(false).setCachedTarget(getResult(target)) - .setResultResult(ResultResult.newBuilder().setDedup(Dedup.newBuilder().setRelMetadata(RelMetadata.getDefaultInstance()))) - .build(); - } - - public static OafRel getProjectOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { - final OafRel.Builder oafRel = OafRel - .newBuilder() - .setSource(source) - .setTarget(target) - .setRelType(RelType.projectOrganization) - .setSubRelType(SubRelType.participation) - .setRelClass(relClass) - .setChild(false) - .setProjectOrganization( - ProjectOrganization.newBuilder().setParticipation( - Participation.newBuilder().setParticipantnumber("" + 1) - .setRelMetadata(relMetadata(relClass, "dnet:project_organization_relations")))); - switch (Participation.RelName.valueOf(relClass)) { - case hasParticipant: - oafRel.setCachedTarget(getProjectFP7(target, "SP3")); - break; - case isParticipant: - oafRel.setCachedTarget(getOrganization(target)); - break; - default: - break; - } - return oafRel.build(); - } - - public static GeneratedMessage getOrganizationOrganization(final String source, final String target, final String relClass) { - final OafRel.Builder oafRel = OafRel - .newBuilder() - .setSource(source) - .setTarget(target) - .setRelType(RelType.organizationOrganization) - .setSubRelType(SubRelType.dedup) - .setRelClass(relClass) - .setChild(true) - .setOrganizationOrganization( - OrganizationOrganization.newBuilder().setDedup( - Dedup.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:organization_organization_relations")))); - - switch (Dedup.RelName.valueOf(relClass)) { - case isMergedIn: - oafRel.setCachedTarget(getOrganization(source)); - break; - case merges: - oafRel.setCachedTarget(getOrganization(target)); - break; - default: - break; - } - return oafRel.build(); - } - - public static OafRel getDatasourceOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { - final OafRel.Builder oafRel = OafRel - .newBuilder() - .setSource(source) - .setTarget(target) - .setRelType(RelType.datasourceOrganization) - .setSubRelType(SubRelType.provision) - .setRelClass(relClass) - .setChild(false) - .setDatasourceOrganization( - DatasourceOrganization.newBuilder().setProvision( - Provision.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:datasource_organization_relations")))); - switch (Provision.RelName.valueOf(relClass)) { - case isProvidedBy: - oafRel.setCachedTarget(getOrganization(target)); - break; - case provides: - oafRel.setCachedTarget(getDatasource(target)); - break; - default: - break; - } - return oafRel.build(); - } - - public static OafRel getSimilarityRel(final String sourceId, final String targetId, final OafEntity result, final String relClass) { - return OafRel - .newBuilder() - .setSource(sourceId) - .setTarget(targetId) - .setRelType(RelType.resultResult) - .setSubRelType(SubRelType.similarity) - .setRelClass(relClass) - .setChild(false) - .setCachedTarget(result) - .setResultResult( - ResultResult.newBuilder().setSimilarity( - Similarity.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:resultResult_relations")).setSimilarity(.4f) - .setType(Similarity.Type.STANDARD))).build(); - } - - public static RelMetadata.Builder relMetadata(final String classname, final String schemename) { - return RelMetadata.newBuilder().setSemantics(getQualifier(classname, schemename)); - } - - public static OafEntity getOrganization(final String orgId) { - return OafEntity - .newBuilder() - .setType(Type.organization) - .setId(orgId) - .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) - .setOrganization( - Organization.newBuilder().setMetadata( - Organization.Metadata.newBuilder().setLegalname(sf("CENTRE D'APPUI A LA RECHERCHE ET A LA FORMATION GIE")) - .setLegalshortname(sf("CAREF")).setWebsiteurl(sf("www.caref-mali.org")) - .setCountry(getQualifier("ML", "dnet:countries")))).build(); - } - - public static OafRel getResultProject(final String from, final String to, final OafEntity project, final String relClass) - throws InvalidProtocolBufferException { - return OafRel - .newBuilder() - .setSource(from) - .setTarget(to) - .setRelType(RelType.resultProject) - .setSubRelType(SubRelType.outcome) - .setRelClass(relClass) - .setChild(false) - .setResultProject( - ResultProject.newBuilder().setOutcome(Outcome.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:result_project_relations")))) - .setCachedTarget(project).build(); - } - - public static OafEntity getProjectFP7(final String projectId, final String fundingProgram) throws InvalidProtocolBufferException { - return OafEntity - .newBuilder() - .setType(Type.project) - .setId(projectId) - .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) - .setProject( - Project.newBuilder() - .setMetadata( - Project.Metadata - .newBuilder() - .setAcronym(sf("5CYRQOL")) - .setTitle(sf("Cypriot Researchers Contribute to our Quality of Life")) - .setStartdate(sf("2007-05-01")) - .setEnddate(sf("2007-10-31")) - .setEcsc39(sf("false")) - .setContracttype(getQualifier("CSA", "ec:FP7contractTypes")) - .addFundingtree( - sf("ec__________::ECECEuropean Commissionec__________::EC::FP7::" - + fundingProgram - + "::PEOPLEMarie-Curie ActionsPEOPLEec:programec__________::EC::FP7::" - + fundingProgram - + "" - + fundingProgram - + "-People" - + fundingProgram - + "ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram")))) - .build(); - } - - public static OafEntity getProjectWT() throws InvalidProtocolBufferException { - return OafEntity - .newBuilder() - .setType(Type.project) - .setId("project|wt::087536") - .addCollectedfrom(getKV("wellcomeTrust", "wellcome trust")) - .setProject( - Project.newBuilder() - .setMetadata( - Project.Metadata - .newBuilder() - .setAcronym(sf("UNKNOWN")) - .setTitle(sf("Research Institute for Infectious Diseases of Poverty (IIDP).")) - .setStartdate(sf("2007-05-01")) - .setEnddate(sf("2007-10-31")) - .setEcsc39(sf("false")) - .setContracttype(getQualifier("UNKNOWN", "wt:contractTypes")) - .addFundingtree( - sf("wt__________::WTWTWellcome Trustwt__________::WT::UNKNOWNUNKNOWNUNKNOWNwt:fundingStream")) - .addFundingtree( - sf("wt__________::WTWTWellcome Trustwt__________::WT::Technology TransferTechnology TransferTechnology Transferwt:fundingStream")))) - .build(); - } - - public static ExtraInfo extraInfo(final String name, final String provenance, final String trust, final String typology, final String value) { - final ExtraInfo.Builder e = ExtraInfo.newBuilder().setName(name).setProvenance(provenance).setTrust(trust).setTypology(typology).setValue(value); - return e.build(); - } - - // public static DocumentClasses documentClasses() { - // DocumentClasses.Builder builder = DocumentClasses.newBuilder(); - // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASSES) + 1; i++) { - // builder.addArXivClasses(getDocumentClass()).addDdcClasses(getDocumentClass()).addWosClasses(getDocumentClass()) - // .addMeshEuroPMCClasses(getDocumentClass()); - // } - // return builder.build(); - // } - // - // private static DocumentClass getDocumentClass() { - // DocumentClass.Builder builder = DocumentClass.newBuilder(); - // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASS_LABELS) + 1; i++) { - // builder.addClassLabels("test_class_" + i); - // } - // return builder.setConfidenceLevel(0.5F).build(); - // } - // - // public static DocumentStatistics documentStatistics() { - // return - // DocumentStatistics.newBuilder().setCitationsFromAllPapers(basicCitationStatistics()).setCitationsFromPublishedPapers(basicCitationStatistics()) - // .build(); - // } - // - // private static BasicCitationStatistics basicCitationStatistics() { - // BasicCitationStatistics.Builder builder = BasicCitationStatistics.newBuilder(); - // for (int i = 0; i < N_CITATION_STATS; i++) { - // builder.addNumberOfCitationsPerYear(statisticsKeyValue()); - // builder.setNumberOfCitations(RandomUtils.nextInt(5) + 1); - // } - // return builder.build(); - // } - // - // private static StatisticsKeyValue statisticsKeyValue() { - // return StatisticsKeyValue.newBuilder().setKey((RandomUtils.nextInt(30) + 1980) + "").setValue(RandomUtils.nextInt(5) + 1).build(); - // } - // - // public static AuthorStatistics authorStatistics() { - // AuthorStatistics.Builder builder = AuthorStatistics.newBuilder(); - // builder.setCore(commonCoreStatistics()); - // for (int i = 0; i < N_COAUTHORS; i++) { - // builder.addCoAuthors(coAuthor()); - // } - // return builder.build(); - // } - // - // private static CoAuthor coAuthor() { - // CoAuthor.Builder builder = CoAuthor.newBuilder(); - // builder.setId("30|od______2345::" + Hashing.md5(RandomStringUtils.random(10))); - // builder.setCoauthoredPapersCount(RandomUtils.nextInt(5) + 1); - // return builder.build(); - // } - // - // public static CommonCoreStatistics commonCoreStatistics() { - // CommonCoreStatistics.Builder builder = CommonCoreStatistics.newBuilder(); - // - // builder.setAllPapers(coreStatistics()); - // builder.setPublishedPapers(coreStatistics()); - // - // return builder.build(); - // } - // - // private static CoreStatistics coreStatistics() { - // CoreStatistics.Builder builder = CoreStatistics.newBuilder(); - // - // builder.setNumberOfPapers(RandomUtils.nextInt(10)); - // builder.setCitationsFromAllPapers(extendedStatistics()); - // builder.setCitationsFromPublishedPapers(extendedStatistics()); - // - // return builder.build(); - // } - // - // private static ExtendedStatistics extendedStatistics() { - // ExtendedStatistics.Builder builder = ExtendedStatistics.newBuilder(); - // - // builder.setBasic(basicCitationStatistics()); - // builder.setAverageNumberOfCitationsPerPaper(RandomUtils.nextFloat()); - // for (int i = 0; i < N_CITATION_STATS; i++) { - // builder.addNumberOfPapersCitedAtLeastXTimes(statisticsKeyValue()); - // } - // - // return builder.build(); - // } - - public static StringField sf(final String s) { - return sf(s, null); - } - - public static StringField sf(final String s, final DataInfo dataInfo) { - final StringField.Builder sf = StringField.newBuilder().setValue(s); - if (dataInfo != null) { - sf.setDataInfo(dataInfo); - } - return sf.build(); - } - -// public static OafDecoder embed(final GeneratedMessage msg, -// final Kind kind, -// final boolean deletedByInference, -// final boolean inferred, -// final String provenance, -// final String action) { -// -// final Oaf.Builder oaf = Oaf -// .newBuilder() -// .setKind(kind) -// .setLastupdatetimestamp(System.currentTimeMillis()) -// .setDataInfo( -// DataInfo.newBuilder().setDeletedbyinference(deletedByInference).setInferred(inferred).setTrust("0.5") -// .setInferenceprovenance(provenance).setProvenanceaction(getQualifier(action, action))); -// switch (kind) { -// case entity: -// oaf.setEntity((OafEntity) msg); -// break; -// case relation: -// oaf.setRel((OafRel) msg); -// break; -// default: -// break; -// } -// -// return OafDecoder.decode(oaf.build()); -// } -// -// public static OafDecoder embed(final GeneratedMessage msg, final Kind kind) { -// return embed(msg, kind, false, false, "inference_provenance", "provenance_action"); -// } - -} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java index 85aa6da..a36eefa 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java @@ -1,41 +1,39 @@ package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.AbstractProtoPaceTest; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.model.MapDocument; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.junit.Before; import org.junit.Test; -public class ClusteringCombinerTest extends AbstractProtoPaceTest { +public class ClusteringCombinerTest { - private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class); - private Config config; - - @Before - public void setUp() { - config = getOrganizationTestConf(); - } - - @Test - public void testCombine() { - - final MapDocument organization = organization(config, "A", "University of Turin", "UNITO"); - log.info("University of Turin"); - log.info(ClusteringCombiner.combine(organization, config)); - } - - @Test - public void testCombineBlacklistAware() { - - final MapDocument organization = organization(config, "A", "University of Turin", "UNITO"); - log.info("University of Turin"); - log.info(BlacklistAwareClusteringCombiner.filterAndCombine(organization, config)); - } + // TODO RE IMPLEMENT Tests with the new configuration +// private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class); +// +// private Config config; +// +// @Before +// public void setUp() { +// config = getOrganizationTestConf(); +// } +// +// @Test +// public void testCombine() { +// +// final MapDocument organization = organization(config, "A", "University of Turin", "UNITO"); +// log.info("University of Turin"); +// log.info(ClusteringCombiner.combine(organization, config)); +// } +// +// @Test +// public void testCombineBlacklistAware() { +// +// final MapDocument organization = organization(config, "A", "University of Turin", "UNITO"); +// log.info("University of Turin"); +// log.info(BlacklistAwareClusteringCombiner.filterAndCombine(organization, config)); +// } } diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java deleted file mode 100644 index 57278ca..0000000 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java +++ /dev/null @@ -1,46 +0,0 @@ -package eu.dnetlib.pace.model; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import com.google.common.collect.Sets.SetView; -import eu.dnetlib.pace.AbstractProtoPaceTest; -import eu.dnetlib.pace.config.Config; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.Test; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest { - - private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class); - - @Test - public void test_serialise1() { - - final String id = "12345"; - - final Config config = getOrganizationTestConf(); - - final MapDocument document = ProtoDocumentBuilder.newInstance(id, getOrganization(id), config.model()); - - assertFalse(document.fieldNames().isEmpty()); - assertFalse(Iterables.isEmpty(document.fields())); - - log.info("original:\n" + document); - - final String stringDoc = MapDocumentSerializer.toString(document); - - log.info("serialization:\n" + stringDoc); - - final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes()); - - final SetView diff = Sets.difference(document.fieldNames(), decoded.fieldNames()); - - assertTrue(diff.isEmpty()); - - log.info("decoded:\n" + decoded); - } - -} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json similarity index 98% rename from dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf rename to dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json index 9e4eb6f..d545f5b 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", + "idPath":"$.id", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true", "maxIterations": "20" @@ -185,12 +186,12 @@ } }, "model" : [ - { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, - { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, - { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}, - { "name" : "originalId", "type" : "String", "path" : "id" } + { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] @@ -301,7 +302,7 @@ "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], - "key::106" : ["seminary", "seminario", "seminaire", "seminar"] + "key::106" : ["seminary", "seminario", "seminaire", "seminar"], "key::107" : ["agricultural forestry", "af", "a f", "a&f"], "key::108" : ["agricultural mechanical", "am", "a m", "a&m"] } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.test.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.test.conf index 6697157..280c16d 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.test.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.test.conf @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", + "idPath":"$.id", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true", "maxIterations": "20" @@ -24,11 +25,13 @@ "layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} }, "model" : [ - { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, - { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, - { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} + + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : ".organization.metadata.country.classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : ".organization.metadata.legalshortname.value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : ".organization.metadata.legalname.value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : ".organization.metadata.websiteurl.value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".pid[] | select(.qualifier.classid == \"grid\") | .value" } + ], "blacklists" : { "legalname" : ["University of Turin"] diff --git a/dnet-dedup.ipr b/dnet-dedup.ipr deleted file mode 100644 index dc3974c..0000000 --- a/dnet-dedup.ipr +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dnet-dedup.iws b/dnet-dedup.iws deleted file mode 100644 index 57de9a0..0000000 --- a/dnet-dedup.iws +++ /dev/null @@ -1,418 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index a34ed9f..5c6466d 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -38,10 +38,6 @@ commons-collections commons-collections - - com.googlecode.protobuf-java-format - protobuf-java-format - org.antlr stringtemplate @@ -59,22 +55,22 @@ org.reflections reflections - com.fasterxml.jackson.core jackson-databind - - - org.codehaus.jackson - jackson-mapper-asl - - org.apache.commons commons-math3 + + com.jayway.jsonpath + json-path + + + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index bcfa5a1..261e13b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,25 +1,25 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.util.PaceException; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.function.BiFunction; + import eu.dnetlib.pace.tree.support.TreeNodeDef; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; -import com.google.common.collect.Maps; - -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class DedupConfig implements Config, Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 7a87a82..3397110 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,12 +1,13 @@ package eu.dnetlib.pace.config; + +import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Maps; import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; -import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.util.List; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index d2722ac..a79d234 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,17 +1,17 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang.StringUtils; + import java.io.IOException; import java.io.Serializable; import java.util.HashSet; import java.util.List; import java.util.Set; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.gson.GsonBuilder; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.map.ObjectMapper; public class WfConfig implements Serializable { @@ -76,12 +76,17 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. */ private int maxChildren = MAX_CHILDREN; + /** Default maximum number of iterations. */ private final static int MAX_ITERATIONS = 20; /** Maximum number of iterations */ private int maxIterations = MAX_ITERATIONS; + /** The Jquery path to retrieve the identifier */ + private String idPath = "$.id"; + + public WfConfig() {} /** @@ -252,6 +257,7 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } + public int getMaxIterations() { return maxIterations; } @@ -260,6 +266,15 @@ public class WfConfig implements Serializable { this.maxIterations = maxIterations; } + public String getIdPath() { + return idPath; + } + + public void setIdPath(String idPath) { + this.idPath = idPath; + + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index d2dab04..c15885e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,19 +1,15 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; + import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.clustering.*; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.PaceResolver; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class ClusteringDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 0d08fdd..055eaaf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index 9accded..b1341fc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -1,8 +1,9 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index cb3b7b4..57552e6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,10 +1,11 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java index 36188e3..186e8d1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java @@ -1,7 +1,7 @@ package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.HashMap; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 2dfa9ae..bc846e7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -36,6 +36,18 @@ public class BlockProcessor { this.dedupConf = dedupConf; } + + public void processSortedBlock(final String key, final List documents, final Reporter context) { + if (documents.size() > 1) { +// log.info("reducing key: '" + key + "' records: " + q.size()); + //process(q, context); + process(prepare(documents), context); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + public void process(final String key, final Iterable documents, final Reporter context) { final Queue q = prepare(documents); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java new file mode 100644 index 0000000..2014173 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -0,0 +1,109 @@ +package eu.dnetlib.pace.util; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.model.MapDocument; +import net.minidev.json.JSONArray; + +import java.util.*; +import java.util.function.Predicate; + +public class MapDocumentUtil { + + + private static final ObjectMapper mapper = new ObjectMapper(); + public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; + public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); + + + + public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { + MapDocument m = new MapDocument(); + m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getJPathString(fdef.getPath(), json))); + break; + case URL: + String uv = getJPathString(fdef.getPath(), json); + if (!urlFilter.test(uv)) uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); + getJPathList(fdef.getPath(), json, fdef.getType()) + .stream() + .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), fi); + break; + } + }); + m.setFieldMap(stringField); + return m; + } + + public static List getJPathList(String path, String json, Type type) { + if (type == Type.List) + return JsonPath.read(json, path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = JsonPath.read(json, path); + } catch (Throwable e) { + return result; + } + if (jresult instanceof JSONArray) { + + ((JSONArray) jresult).forEach(it -> { + + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { + + } + } + ); + return result; + } + + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { + + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + + + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String)o; + if (o instanceof JSONArray && ((JSONArray)o).size()>0) + return (String)((JSONArray)o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 3b87ced..cd553ba 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,6 +1,10 @@ package eu.dnetlib.pace.config; + import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; import org.junit.Test; import java.util.Map; @@ -57,4 +61,27 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } + + + @Test + public void testAsMapDocumentJPath() throws Exception { + + DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result); + + System.out.println(mapDocument.getFieldMap()); + + } + + + + + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json new file mode 100644 index 0000000..a24be24 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json @@ -0,0 +1 @@ +{"kind": "entity","entity": {"type": "result","result": {"metadata": {"subject": [{"value": "open access","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "infrastructure","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "data model","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "CERIF","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "DataCite","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}}],"title": [{"value": "The Data Model of the OpenAIRE Scientific Communication e-Infrastructure","qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"}}],"dateofacceptance": {"value": "2012-11-30"},"publisher": {"value": ""},"resulttype": {"classid": "publication","classname": "publication","schemeid": "dnet:result_typologies","schemename": "dnet:result_typologies"},"storagedate": {"value": "2012-11-30"},"resourcetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"size": {"value": ""},"version": {"value": ""},"description": [{"value": "The OpenAIREplus project aims to further develop and operate the OpenAIRE e-infrastructure, in order to provide a central entry point to Open Access and \\tnon-Open Access publications and datasets funded by the European Commission and National agencies. The infrastructure provides the services to populate, curate, and enrich an Information Space by collecting metadata descriptions relative to organizations, data sources, projects, funding programmes, persons, publications, and datasets. Stakeholders in the research process and\\t\\t\\t\\tscientific communication, such as researchers, funding agencies, organizations nvolved in projects, project coordinators, can here find the information to improve their research and statistics to measure the impact of Open Access and funding schemes over research. In this paper, we introduce the functional requirements to be satisfied and describe the OpenAIREplus data model entities and relationships required to represent information capable of meeting them."}],"license": [{"value": ""}],"author": [{"fullname": "Manghi, Paolo","name": "Paolo","surname": "Manghi","rank": 1},{"fullname": "Houssos, Nikos","name": "Nikos","surname": "Houssos","rank": 2,"pid": [{"key": "ORCID","value": "0000-0002-3748-8359"}]},{"fullname": "Mikulicic, Marko","name": "Marko","surname": "Mikulicic","rank": 3},{"fullname": "Jf6rg, Brigitte","name": "Brigitte","surname": "Jo\u0308rg","rank": 4}]},"instance": [{"accessright": {"classid": "OPEN","classname": "Open Access","schemeid": "dnet:access_modes","schemename": "dnet:access_modes"},"instancetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"hostedby": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"license": {"value": ""},"url": ["http://dx.doi.org/10.1007/978-3-642-35233-1_18"],"collectedfrom": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"dateofacceptance": {"value": "2012-11-30"},"distributionlocation": ""}]},"originalId": ["123456789/7","10.1007/978-3-642-35233-1_18"],"collectedfrom": [{"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"}],"pid": [{"value": "123456789/7","qualifier": {"classid": "handle","classname": "handle","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}},{"value": "10.1007/978-3-642-35233-1_18","qualifier": {"classid": "doi","classname": "doi","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}}],"dateofcollection": "2019-11-05T10:07:42.263Z","id": "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","dateoftransformation": "2019-11-06T17:11:47.505Z","oaiprovenance": {"originDescription": {"harvestDate": "2019-11-05T10:07:42.263Z","altered": true,"baseURL": "https%3A%2F%2Fdspace-cris.4science.cloud%2Foai%2Fopenairecris","identifier": "oai:dspace-cris.4science.cloud:Publications/123456789/7","datestamp": "2019-09-05T21:52:21Z","metadataNamespace": ""}}},"dataInfo": {"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "sysimport:crosswalk:datasetarchive","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"invisible": false}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json new file mode 100644 index 0000000..96094b8 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json @@ -0,0 +1,48 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "idPath": "$.entity.id", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "decisionTree": {}, + "model" : [ + { "name" : "pid", "type" : "JSON", "path" : "$.entity.pid"}, + { "name" : "dateofacceptance", "type" : "String", "path" : "$.entity.result.metadata.dateofacceptance.value"}, + { "name" : "title", "type" : "String","path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" }, + { "name" : "authors", "type" : "List", "path" : "$.entity.result.metadata.author[*].fullname" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } , + "synonyms": {} + } + +} diff --git a/pom.xml b/pom.xml index fb5c6b8..536ba38 100644 --- a/pom.xml +++ b/pom.xml @@ -84,6 +84,16 @@ + + + central + Central Repository + http://repo.maven.apache.org/maven2 + + true + + + @@ -246,21 +256,6 @@ stringtemplate 3.2 - - com.googlecode.protobuf-java-format - protobuf-java-format - 1.2 - - - eu.dnetlib - dnet-openaire-data-protos - 3.9.3-proto250 - - - eu.dnetlib - dnet-openaireplus-mapping-utils - 6.2.21 - com.fasterxml.jackson.core @@ -269,10 +264,17 @@ - org.codehaus.jackson - jackson-mapper-asl - 1.9.13 + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + + + org.apache.commons @@ -351,6 +353,12 @@ oozie-client 5.1.0 + + com.jayway.jsonpath + json-path + 2.4.0 + +