From 5a52aed8e104541203b485044de8adf572520b26 Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 13 Sep 2021 14:53:19 +0200 Subject: [PATCH] dedup test implementation & graph drawing tools --- .../test.properties | 2 +- dnet-dedup-test/job-override.properties | 7 +- dnet-dedup-test/pom.xml | 11 + .../src/main/java/eu/dnetlib/Deduper.java | 156 +------ .../dnetlib/jobs/SparkCreateDedupEntity.java | 2 +- .../eu/dnetlib/jobs/SparkCreateSimRels.java | 7 +- .../resources/dedup/oozie_app/workflow.xml | 2 +- .../java/eu/dnetlib/pace/DedupLocalTest.java | 233 +++++++++- .../test/java/eu/dnetlib/pace/GraphDraw.java | 111 +++++ .../eu/dnetlib/pace/config/ds.tree.conf.json | 67 +++ .../pace/config/pub.prod.tree.conf.json | 401 +++++++++++++++++ .../pace/config/publication.current.conf.json | 12 +- .../pace/examples/publications.to.fix.json | 4 + .../parameters/createSimRels_parameters.json | 6 + dnet-dedup.ipr | 113 +++++ dnet-dedup.iws | 418 ++++++++++++++++++ .../pace/util/BlockProcessorForTesting.java | 232 ++++++++++ .../clustering/ClusteringFunctionTest.java | 17 +- .../pace/comparators/ComparatorTest.java | 38 +- .../eu/dnetlib/pace/config/ConfigTest.java | 18 + .../eu/dnetlib/pace/config/pub.prod.conf.json | 402 +++++++++++++++++ .../pace/config/publication.example.json | 1 + pom.xml | 18 +- release.properties | 11 - 24 files changed, 2093 insertions(+), 196 deletions(-) create mode 100644 dnet-dedup-test/src/test/java/eu/dnetlib/pace/GraphDraw.java create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.prod.tree.conf.json create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/publications.to.fix.json create mode 100644 dnet-dedup.ipr create mode 100644 dnet-dedup.iws create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json delete mode 100644 release.properties diff --git a/dhp-build/dhp-build-properties-maven-plugin/test.properties b/dhp-build/dhp-build-properties-maven-plugin/test.properties index 5733ce3..6810a00 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/test.properties +++ b/dhp-build/dhp-build-properties-maven-plugin/test.properties @@ -1,2 +1,2 @@ -# Mon May 03 16:05:14 CEST 2021 +# Mon Sep 13 14:51:29 CEST 2021 projectPropertyKey=projectPropertyValue diff --git a/dnet-dedup-test/job-override.properties b/dnet-dedup-test/job-override.properties index ce77812..1801507 100644 --- a/dnet-dedup-test/job-override.properties +++ b/dnet-dedup-test/job-override.properties @@ -1,5 +1,6 @@ -entitiesPath = /tmp/prod_provision/graph/01_graph_raw/publication -workingPath = /user/michele.debonis/erf_test/workingdir +#entitiesPath = /tmp/prod_provision/graph/01_graph_raw/publication +entitiesPath = /tmp/publications_test_dump +workingPath = /user/michele.debonis/erf_test/workingdirtree dedupConfPath = /user/michele.debonis/erf_test/pubs.tree.conf.json numPartitions = 20 -useTree = true \ No newline at end of file +useTree = false \ No newline at end of file diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 59d9ea9..14f6794 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -132,6 +132,17 @@ com.jayway.jsonpath json-path + + org.mockito + mockito-core + test + + + + org.mockito + mockito-junit-jupiter + test + diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java index c60ac80..ab7cb2c 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java @@ -1,34 +1,22 @@ package eu.dnetlib; -import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.hash.Hashing; import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.MapDocumentComparator; -import eu.dnetlib.pace.tree.JsonListMatch; -import eu.dnetlib.pace.tree.LevensteinTitle; -import eu.dnetlib.pace.tree.SizeMatch; -import eu.dnetlib.pace.tree.TitleVersionMatch; -import eu.dnetlib.pace.tree.support.TreeProcessor; -import eu.dnetlib.pace.util.BlockProcessor; +import eu.dnetlib.pace.util.BlockProcessorForTesting; import eu.dnetlib.pace.util.MapDocumentUtil; -import eu.dnetlib.pace.util.Reporter; import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.reporter.SparkReporter; import eu.dnetlib.support.Block; import eu.dnetlib.support.ConnectedComponent; import eu.dnetlib.support.Relation; -import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.graphx.Edge; import org.apache.spark.rdd.RDD; @@ -39,7 +27,6 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import scala.Serializable; import scala.Tuple2; -import scala.math.Ordering; import java.nio.charset.Charset; import java.util.*; @@ -98,15 +85,15 @@ public class Deduper implements Serializable { } public static JavaRDD computeRelations( - JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { + JavaSparkContext context, JavaPairRDD blocks, DedupConfig config, boolean useTree) { Map accumulators = Utility.constructAccumulator(config, context.sc()); return blocks .flatMapToPair( it -> { final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config) - .processSortedBlock(it._1(), it._2().getDocuments(), reporter); + new BlockProcessorForTesting(config) + .processSortedBlock(it._1(), it._2().getDocuments(), reporter, useTree); return reporter.getRelations().iterator(); }) .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel"))) @@ -114,138 +101,7 @@ public class Deduper implements Serializable { .map(Tuple2::_2); } - public static Queue prepareQueue(final Iterable documents, DedupConfig config) { - final Queue queue = new PriorityQueue<>(100, new MapDocumentComparator(config.getWf().getOrderField())); - - final Set seen = new HashSet(); - final int queueMaxSize = config.getWf().getQueueMaxSize(); - - documents.forEach(doc -> { - if (queue.size() <= queueMaxSize) { - final String id = doc.getIdentifier(); - - if (!seen.contains(id)) { - seen.add(id); - queue.add(doc); - } - } - }); - - return queue; - } - - public static JavaRDD computePublicationRelations( - JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { - - return blocks. - flatMapToPair((PairFlatMapFunction, String, String>) - it -> { - List> relations = new ArrayList<>(); - - if (it._2().getDocuments().size()>1) { - - Queue queue = prepareQueue(it._2().getDocuments(), config); - - while (!queue.isEmpty()) { - - final MapDocument pivot = queue.remove(); - final String idPivot = pivot.getIdentifier(); - - WfConfig wf = config.getWf(); - final Field fieldsPivot = pivot.values(wf.getOrderField()); - final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); - - if (fieldPivot != null) { - int i = 0; - for (final MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); - - if (config.getWf().getSkipList().contains(StringUtils.substringBetween(idCurr, "|", "::"))) { - break; - } - - if (i > wf.getSlidingWindowSize()) { - break; - } - - final Field fieldsCurr = curr.values(wf.getOrderField()); - final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue(); - - if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - - double score = 0.0; - Map params = new HashMap<>(); - params.put("jpath_value", "$.value"); - params.put("jpath_classid", "$.qualifier.classid"); - JsonListMatch jsonListMatch = new JsonListMatch(params); - double result = jsonListMatch.compare(pivot.getFieldMap().get("pid"), curr.getFieldMap().get("pid"), config); - if (result > 0.5) //if the result of the comparison is greater than the threshold - score += 10.0; //high score because it should match when the first condition is satisfied - else - score += 0.0; - - TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); - double result1 = titleVersionMatch.compare(pivot.getFieldMap().get("title"), curr.getFieldMap().get("title"), config); - SizeMatch sizeMatch = new SizeMatch(params); - double result2 = sizeMatch.compare(pivot.getFieldMap().get("authors"), curr.getFieldMap().get("authors"), config); - if ((result1 == 1.0 && result2 == 1.0) || (result1 == -1.0 && result2 == 1.0) || (result1 == 1.0 && result2 == -1.0) || (result1 == -1.0 && result2 == -1.0)) - score += 0.0; - else - score -= 1.0; - - LevensteinTitle levensteinTitle = new LevensteinTitle(params); - double result3 = levensteinTitle.compare(pivot.getFieldMap().get("title"), curr.getFieldMap().get("title"), config); - score += result3; - - if (score >= 0.99) { - relations.add(new Tuple2<>(idPivot, idCurr)); - relations.add(new Tuple2<>(idCurr, idPivot)); - } - - } - } - } - } - } - - return relations.iterator(); - }) - .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel"))) - .reduceByKey((a,b) -> a) - .map(Tuple2::_2); - } - - public static boolean comparePublications(MapDocument a, MapDocument b, DedupConfig config){ - - double score = 0.0; - Map params = new HashMap<>(); - params.put("jpath_value", "$.value"); - params.put("jpath_classid", "$.qualifier.classid"); - JsonListMatch jsonListMatch = new JsonListMatch(params); - double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config); - if (result > 0.5) //if the result of the comparison is greater than the threshold - score += 1.0; - else - score += 0.0; - - TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); - double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); - SizeMatch sizeMatch = new SizeMatch(params); - double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); - if ((result1 == 1.0 && result2 == 1.0) || (result1 == -1.0 && result2 == 1.0) || (result1 == 1.0 && result2 == -1.0) || (result1 == -1.0 && result2 == -1.0)) - score += 0.0; - else - score -= 1.0; - - LevensteinTitle levensteinTitle = new LevensteinTitle(params); - double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); - score += result3; - - return score >= 0.99; - - } - - public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath){ + public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath, boolean useTree){ JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -261,7 +117,7 @@ public class Deduper implements Serializable { JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); // create relations by comparing only elements in the same group - JavaRDD relations = Deduper.computeRelations(sc, blocks, dedupConf); + JavaRDD relations = Deduper.computeRelations(sc, blocks, dedupConf, useTree); // save the simrel in the workingdir spark diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java index ff6ce58..a66f7eb 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java @@ -61,7 +61,7 @@ public class SparkCreateDedupEntity extends AbstractSparkJob { log.info("dedupConfPath: '{}'", dedupConfPath); log.info("numPartitions: '{}'", numPartitions); - DedupConfig dedupConf = DedupConfig.load(readResource("/jobs/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)); + DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(dedupConfPath)); JavaPairRDD entities = spark .read() diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java index cdc971f..8459a96 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java @@ -82,12 +82,7 @@ public class SparkCreateSimRels extends AbstractSparkJob { JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig); // create relations by comparing only elements in the same group - JavaRDD relations; - - if (useTree) - relations = Deduper.computeRelations(sc, blocks, dedupConfig); - else - relations = Deduper.computePublicationRelations(sc, blocks, dedupConfig); + JavaRDD relations = Deduper.computeRelations(sc, blocks, dedupConfig, useTree); // save the simrel in the workingdir spark diff --git a/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml b/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml index 8a5aa44..a42086a 100644 --- a/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml +++ b/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml @@ -131,7 +131,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --conf spark.dynamicAllocation.enabled=false + --conf spark.dynamicAllocation.enabled=true --entitiesPath${entitiesPath} --workingPath${workingPath} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 85fee87..ad6fa29 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -5,37 +5,76 @@ import eu.dnetlib.jobs.SparkCreateDedupEntity; import eu.dnetlib.jobs.SparkCreateMergeRels; import eu.dnetlib.jobs.SparkCreateSimRels; import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.tree.JsonListMatch; +import eu.dnetlib.pace.tree.LevensteinTitle; +import eu.dnetlib.pace.tree.SizeMatch; +import eu.dnetlib.pace.tree.TitleVersionMatch; +import eu.dnetlib.pace.tree.support.TreeProcessor; +import eu.dnetlib.pace.util.BlockProcessorForTesting; +import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.support.ArgumentApplicationParser; +import eu.dnetlib.support.Block; +import eu.dnetlib.support.Relation; +import jdk.nashorn.internal.ir.annotations.Ignore; +import org.apache.commons.crypto.utils.IoUtils; import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import scala.Tuple2; +import java.awt.event.WindowAdapter; +import java.awt.event.WindowEvent; +import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; import java.net.URISyntaxException; import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; -@Disabled +@ExtendWith(MockitoExtension.class) +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@TestInstance(TestInstance.Lifecycle.PER_CLASS) public class DedupLocalTest extends DedupTestUtils { static SparkSession spark; static DedupConfig config; static JavaSparkContext context; + private static Object lock = new Object(); + GraphDraw frame = new GraphDraw("Test Window"); + final String entitiesPath = Paths - .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/orgs_dump").toURI()) + .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/organization").toURI()) .toFile() .getAbsolutePath(); + final static String workingPath = "/tmp/working_dir"; final static String numPartitions = "20"; - final static String dedupConfPath = "/eu/dnetlib/pace/config/orgs.tree.conf.json"; - final static String simRelsPath = workingPath + "/organization_simrel"; - final static String mergeRelsPath = workingPath + "/organization_mergerel"; - final static String dedupEntityPath = workingPath + "/organization_dedupentity"; + final String dedupConfPath = Paths + .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/orgs.tree.conf.json").toURI()) + .toFile() + .getAbsolutePath(); + + final static String simRelsPath = workingPath + "/simrels"; + final static String mergeRelsPath = workingPath + "/mergerels"; + final static String dedupEntityPath = workingPath + "/dedupentities"; public DedupLocalTest() throws URISyntaxException { } @@ -48,11 +87,11 @@ public class DedupLocalTest extends DedupTestUtils { } @BeforeAll - public static void setup() throws IOException { + public void setup() throws IOException { cleanup(); - config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/orgs.tree.conf.json", DedupLocalTest.class)); + config = DedupConfig.load(readFileFromHDFS(dedupConfPath)); spark = SparkSession .builder() @@ -63,7 +102,25 @@ public class DedupLocalTest extends DedupTestUtils { } + @AfterAll + public static void finalCleanUp() throws IOException { + cleanup(); + } + + protected static String readFileFromHDFS(String filePath) throws IOException { + + Path path=new Path(filePath); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path))); + try { + return String.join("", br.lines().collect(Collectors.toList())); + } finally { + br.close(); + } + } + @Test + @Order(1) public void createSimRelTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class)); @@ -73,16 +130,21 @@ public class DedupLocalTest extends DedupTestUtils { "-e", entitiesPath, "-w", workingPath, "-np", numPartitions, - "-dc", dedupConfPath + "-dc", dedupConfPath, + "-ut", "true" }); new SparkCreateSimRels( parser, spark ).run(); + + long simrels_number = spark.read().load(simRelsPath).count(); + System.out.println("simrels_number = " + simrels_number); } @Test + @Order(2) public void createMergeRelTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class)); @@ -102,6 +164,7 @@ public class DedupLocalTest extends DedupTestUtils { } @Test + @Order(3) public void createDedupEntityTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)); @@ -120,7 +183,8 @@ public class DedupLocalTest extends DedupTestUtils { ).run(); } - @Test + @Test //full deduplication workflow test + @Ignore public void deduplicationTest() throws IOException { long before_simrels = System.currentTimeMillis(); @@ -128,7 +192,8 @@ public class DedupLocalTest extends DedupTestUtils { config, spark, entitiesPath, - simRelsPath + simRelsPath, + true ); long simrels_time = System.currentTimeMillis() - before_simrels; @@ -167,4 +232,148 @@ public class DedupLocalTest extends DedupTestUtils { cleanup(); } -} \ No newline at end of file + + @Test //test the match between two JSON + @Ignore + public void matchTest() throws Exception { + String json1 = "{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"pid\": [], \"oaiprovenance\": {\"originDescription\": {\"metadataNamespace\": \"\", \"harvestDate\": \"2021-06-10T10:03:36.091Z\", \"baseURL\": \"file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fsygma%2Fnew_ingestion%2Fcrossref\", \"datestamp\": \"\", \"altered\": true, \"identifier\": \"\"}}, \"relevantdate\": [], \"contributor\": [], \"id\": \"50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446\", \"subject\": [], \"lastupdatetimestamp\": 1628684944004, \"author\": [{\"surname\": \"Pan\", \"fullname\": \"Pan, Mengwu\", \"pid\": [], \"name\": \"Mengwu\", \"rank\": 1}, {\"surname\": \"Blattner\", \"fullname\": \"Blattner, Christine\", \"pid\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"orcid_pending\", \"classname\": \"Open Researcher and Contributor ID\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"0000-0002-7250-5273\"}], \"name\": \"Christine\", \"rank\": 2}], \"collectedfrom\": [{\"value\": \"Sygma\", \"key\": \"10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9\"}], \"instance\": [{\"refereed\": {\"classid\": \"UNKNOWN\", \"classname\": \"Unknown\", \"schemename\": \"dnet:review_levels\", \"schemeid\": \"dnet:review_levels\"}, \"hostedby\": {\"value\": \"Cancers\", \"key\": \"10|issn__online::69ba871b903253074dcf4054e619afff\"}, \"license\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"value\": \"https://creativecommons.org/licenses/by/4.0/\"}, \"url\": [\"http://dx.doi.org/10.3390/cancers13040745\"], \"pid\": [], \"distributionlocation\": \"\", \"alternateIdentifier\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"doi\", \"classname\": \"Digital Object Identifier\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"10.3390/cancers13040745\"}], \"collectedfrom\": {\"value\": \"Sygma\", \"key\": \"10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9\"}, \"accessright\": {\"classid\": \"OPEN\", \"classname\": \"Open Access\", \"schemename\": \"dnet:access_modes\", \"schemeid\": \"dnet:access_modes\"}, \"instancetype\": {\"classid\": \"0001\", \"classname\": \"Article\", \"schemename\": \"dnet:publication_resource\", \"schemeid\": \"dnet:publication_resource\"}}], \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemename\": \"dnet:result_typologies\", \"schemeid\": \"dnet:result_typologies\"}, \"dateofcollection\": \"2021-06-10T10:03:36.091Z\", \"fulltext\": [], \"dateoftransformation\": \"2021-07-20T16:59:21.682Z\", \"description\": [], \"format\": [], \"journal\": {\"issnPrinted\": \"\", \"vol\": \"13\", \"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"name\": \"Cancers\", \"iss\": \"4\", \"sp\": \"745\", \"edition\": \"\", \"issnOnline\": \"2072-6694\", \"ep\": \"\", \"issnLinking\": \"\"}, \"coverage\": [], \"externalReference\": [], \"language\": {\"classid\": \"eng\", \"classname\": \"English\", \"schemename\": \"dnet:languages\", \"schemeid\": \"dnet:languages\"}, \"bestaccessright\": {\"classid\": \"OPEN\", \"classname\": \"Open Access\", \"schemename\": \"dnet:access_modes\", \"schemeid\": \"dnet:access_modes\"}, \"country\": [], \"extraInfo\": [], \"originalId\": [\"10.3390/cancers13040745\", \"50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446\"], \"source\": [], \"context\": [], \"title\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"main title\", \"classname\": \"main title\", \"schemename\": \"dnet:dataCite_title\", \"schemeid\": \"dnet:dataCite_title\"}, \"value\": \"Regulation of p53 by E3s\"}]}"; + String json2 = "{\"dataInfo\": {\"invisible\": false, \"trust\": \"0.9\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"resourcetype\": {\"classid\": \"0001\", \"classname\": \"0001\", \"schemename\": \"dnet:dataCite_resource\", \"schemeid\": \"dnet:dataCite_resource\"}, \"pid\": [{\"qualifier\": {\"classid\": \"doi\", \"classname\": \"doi\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"10.3390/cancers13040745\"}], \"bestaccessright\": {\"classid\": \"OPEN\", \"classname\": \"Open Access\", \"schemename\": \"dnet:access_modes\", \"schemeid\": \"dnet:access_modes\"}, \"relevantdate\": [{\"qualifier\": {\"classid\": \"created\", \"classname\": \"created\", \"schemename\": \"dnet:dataCite_date\", \"schemeid\": \"dnet:dataCite_date\"}, \"value\": \"2021-02-12T21:12:10Z\"}, {\"qualifier\": {\"classid\": \"published-online\", \"classname\": \"published-online\", \"schemename\": \"dnet:dataCite_date\", \"schemeid\": \"dnet:dataCite_date\"}, \"value\": \"2021-02-11\"}], \"contributor\": [], \"id\": \"50|doi_________::3bbb03e6ec8df0d219b2d2165ea1d446\", \"description\": [{\"value\": \"More than 40 years of research on p53 have given us tremendous knowledge about this protein. Today we know that p53 plays a role in different biological processes such as proliferation, invasion, pluripotency, metabolism, cell cycle control, ROS (reactive oxygen species) production, apoptosis, inflammation and autophagy. In the nucleus, p53 functions as a bona-fide transcription factor which activates and represses transcription of a number of target genes. In the cytoplasm, p53 can interact with proteins of the apoptotic machinery and by this also induces cell death. Despite being so important for the fate of the cell, expression levels of p53 are kept low in unstressed cells and the protein is largely inactive. The reason for the low expression level is that p53 is efficiently degraded by the ubiquitin-proteasome system and the vast inactivity of the tumor suppressor protein under normal growth conditions is due to the absence of activating and the presence of inactivating posttranslational modifications. E3s are important enzymes for these processes as they decorate p53 with ubiquitin and small ubiquitin-like proteins and by this control p53 degradation, stability and its subcellular localization. In this review, we provide an overview about E3s that target p53 and discuss the connection between p53, E3s and tumorigenesis.\"}], \"lastupdatetimestamp\": 1613647061057, \"author\": [{\"fullname\": \"Mengwu Pan\", \"pid\": [{\"qualifier\": {\"classid\": \"URL\", \"classname\": \"URL\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"https://academic.microsoft.com/#/detail/3128025883\"}], \"rank\": 1}, {\"fullname\": \"Christine Blattner\", \"pid\": [{\"qualifier\": {\"classid\": \"URL\", \"classname\": \"URL\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"https://academic.microsoft.com/#/detail/3126711219\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.9\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"orcid_pending\", \"classname\": \"orcid_pending\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"http://orcid.org/0000-0002-7250-5273\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.91\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:entityregistry\", \"classname\": \"Harvested\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"orcid\", \"classname\": \"orcid\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"0000-0002-7250-5273\"}], \"rank\": 2}], \"collectedfrom\": [{\"value\": \"Crossref\", \"key\": \"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2\"}, {\"value\": \"UnpayWall\", \"key\": \"10|openaire____::8ac8380272269217cb09a928c8caa993\"}, {\"value\": \"ORCID\", \"key\": \"10|openaire____::806360c771262b4d6770e7cdf04b5c5a\"}, {\"value\": \"Microsoft Academic Graph\", \"key\": \"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a\"}], \"instance\": [{\"hostedby\": {\"value\": \"Cancers\", \"key\": \"10|doajarticles::69ba871b903253074dcf4054e619afff\"}, \"license\": {\"value\": \"https://creativecommons.org/licenses/by/4.0/\"}, \"url\": [\"https://www.mdpi.com/2072-6694/13/4/745/pdf\", \"http://dx.doi.org/10.3390/cancers13040745\"], \"pid\": [{\"qualifier\": {\"classid\": \"doi\", \"classname\": \"doi\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"10.3390/cancers13040745\"}], \"dateofacceptance\": {\"value\": \"2021-02-11\"}, \"collectedfrom\": {\"value\": \"Crossref\", \"key\": \"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2\"}, \"accessright\": {\"classid\": \"OPEN\", \"classname\": \"Open Access\", \"schemename\": \"dnet:access_modes\", \"schemeid\": \"dnet:access_modes\"}, \"instancetype\": {\"classid\": \"0001\", \"classname\": \"Article\", \"schemename\": \"dnet:publication_resource\", \"schemeid\": \"dnet:publication_resource\"}}, {\"hostedby\": {\"value\": \"Cancers\", \"key\": \"10|doajarticles::69ba871b903253074dcf4054e619afff\"}, \"license\": {\"value\": \"cc-by\"}, \"url\": [\"https://res.mdpi.com/d_attachment/cancers/cancers-13-00745/article_deploy/cancers-13-00745.pdf\"], \"pid\": [{\"qualifier\": {\"classid\": \"doi\", \"classname\": \"doi\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"10.3390/cancers13040745\"}], \"collectedfrom\": {\"value\": \"UnpayWall\", \"key\": \"10|openaire____::8ac8380272269217cb09a928c8caa993\"}, \"accessright\": {\"classid\": \"OPEN\", \"classname\": \"Open Access\", \"schemename\": \"dnet:access_modes\", \"schemeid\": \"dnet:access_modes\"}, \"instancetype\": {\"classid\": \"0001\", \"classname\": \"Article\", \"schemename\": \"dnet:publication_resource\", \"schemeid\": \"dnet:publication_resource\"}}, {\"hostedby\": {\"value\": \"Cancers\", \"key\": \"10|doajarticles::69ba871b903253074dcf4054e619afff\"}, \"url\": [\"https://www.mdpi.com/2072-6694/13/4/745\", \"https://www.mdpi.com/2072-6694/13/4/745/pdf\", \"https://academic.microsoft.com/#/detail/3128658507\"], \"pid\": [{\"qualifier\": {\"classid\": \"doi\", \"classname\": \"doi\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"10.3390/cancers13040745\"}], \"collectedfrom\": {\"value\": \"Microsoft Academic Graph\", \"key\": \"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a\"}, \"accessright\": {\"classid\": \"OPEN\", \"classname\": \"Open Access\", \"schemename\": \"dnet:access_modes\", \"schemeid\": \"dnet:access_modes\"}, \"instancetype\": {\"classid\": \"0001\", \"classname\": \"Article\", \"schemename\": \"dnet:publication_resource\", \"schemeid\": \"dnet:publication_resource\"}}], \"dateofcollection\": \"2021-02-18T11:17:41Z\", \"fulltext\": [], \"dateofacceptance\": {\"value\": \"2021-02-11\"}, \"format\": [], \"journal\": {\"vol\": \"13\", \"sp\": \"745\", \"issnOnline\": \"2072-6694\", \"name\": \"Cancers\"}, \"measures\": [], \"subject\": [{\"qualifier\": {\"classid\": \"keywords\", \"classname\": \"keywords\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Cancer Research\"}, {\"qualifier\": {\"classid\": \"keywords\", \"classname\": \"keywords\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Oncology\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Carcinogenesis\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.51921105\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"medicine.disease_cause\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.51921105\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"medicine\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Cytoplasm\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Transcription factor\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Cell biology\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Ubiquitin\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.5209853\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"biology.protein\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.5209853\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"biology\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Cell\"}, {\"dataInfo\": {\"invisible\": false, \"trust\": \"0.51552147\", \"deletedbyinference\": false, \"inferred\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}}, \"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"medicine.anatomical_structure\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Programmed cell death\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Autophagy\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Chemistry\"}, {\"qualifier\": {\"classid\": \"MAG\", \"classname\": \"Microsoft Academic Graph classification\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": \"dnet:subject_classification_typologies\"}, \"value\": \"Subcellular localization\"}], \"coverage\": [], \"externalReference\": [], \"publisher\": {\"value\": \"MDPI AG\"}, \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemename\": \"dnet:result_typologies\", \"schemeid\": \"dnet:result_typologies\"}, \"country\": [], \"extraInfo\": [], \"originalId\": [\"cancers13040745\", \"10.3390/cancers13040745\", \"50|doiboost____::3bbb03e6ec8df0d219b2d2165ea1d446\", \"3128658507\"], \"source\": [{\"value\": \"Crossref\"}, {}], \"context\": [], \"title\": [{\"qualifier\": {\"classid\": \"alternative title\", \"classname\": \"alternative title\", \"schemename\": \"dnet:dataCite_title\", \"schemeid\": \"dnet:dataCite_title\"}, \"value\": \"Regulation of p53 by E3s\"}, {\"qualifier\": {\"classid\": \"main title\", \"classname\": \"main title\", \"schemename\": \"dnet:dataCite_title\", \"schemeid\": \"dnet:dataCite_title\"}, \"value\": \"Regulation of p53 by E3s\"}]}"; + + MapDocument a = MapDocumentUtil.asMapDocumentWithJPath(config, json1); + MapDocument b = MapDocumentUtil.asMapDocumentWithJPath(config, json2); + + boolean result = new TreeProcessor(config).compare(a,b); + + System.out.println("Tree Processor Result = " + result); + + } + + @Test //test the dedup of a group of JSON + @Ignore + public void dedupTest() throws Exception { + final String entitiesPath = Paths + .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()) + .toFile() + .getAbsolutePath(); + + DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(Paths + .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.prod.tree.conf.json").toURI()) + .toFile() + .getAbsolutePath())); + + JavaPairRDD mapDocuments = context + .textFile(entitiesPath) + .mapToPair( + (PairFunction) s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + // create blocks for deduplication + JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + for (Tuple2 b : blocks.collect()) { + System.out.println("*******GROUPS********"); + System.out.println("key = " + b._1()); + System.out.println("elements = " + b._2().elements()); + System.out.println("items = " + b._2().getDocuments().stream().map(d -> d.getIdentifier()).collect(Collectors.joining(","))); + System.out.println("*********************"); + } + + // create relations by comparing only elements in the same group + JavaRDD relations = Deduper.computeRelations(context, blocks, dedupConf, true); + for (Relation r: relations.collect()) { + System.out.println("*******RELATIONS*******"); + System.out.println("source = " + r.getSource()); + System.out.println("target = " + r.getTarget()); + System.out.println("***********************"); + } + + //vertexes + List vertexes = mapDocuments.map(doc -> doc._1()).collect(); + + //edges + List> edges = new ArrayList<>(); + relations.collect().stream().forEach(r -> edges.add(new Tuple2(r.getSource(), r.getTarget()))); + + drawGraph(vertexes, edges); + + cleanup(); + + synchronized(lock) { + while (frame.isVisible()) + try { + lock.wait(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public void drawGraph(List vertexes, List> edges) { + + frame.setSize(2000,2000); + + frame.setVisible(true); + + int nVertexes = vertexes.size(); + int nRow = (int) Math.round(Math.floor(Math.sqrt(nVertexes))); + int nCol = (int) Math.round(Math.ceil(Math.sqrt(nVertexes))); + int cStepSize = 500; + int rStepSize = 200; + for(int i = 0; i < nCol; i++){ + for(int j = 0; j < nRow; j++){ + frame.addNode(vertexes.get(i + nRow*j), 200 + j*cStepSize, 50 + i*rStepSize); + if (i+nRow*j == nVertexes) + continue; + } + } + + for (Tuple2 e: edges) { + frame.addEdge(vertexes.indexOf(e._1()), vertexes.indexOf(e._2())); + } + + frame.addWindowListener(new WindowAdapter() { + @Override + public void windowClosing(WindowEvent e) { + synchronized (lock) { + frame.setVisible(false); + lock.notify(); + } + } + }); + } + +} + +// function mocking the tree processor by considering every comparison instead of using early exits +// private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { +// +// double score = 0.0; +// //LAYER 1 - comparison of the PIDs json lists +// Map params = new HashMap<>(); +// params.put("jpath_value", "$.value"); +// params.put("jpath_classid", "$.qualifier.classid"); +// JsonListMatch jsonListMatch = new JsonListMatch(params); +// double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config); +// if (result >= 0.5) //if the result of the comparison is greater than the threshold +// score += 10.0; //high score because it should match when the first condition is satisfied +// else +// score += 0.0; +// +// //LAYER 2 - comparison of the title version and the size of the authors lists +// TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); +// double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); +// SizeMatch sizeMatch = new SizeMatch(params); +// double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); +// if (Math.min(result1, result2) != 0) +// score+=0; +// else +// score-=2; +// +// //LAYER 3 - computation of levenshtein on titles +// LevensteinTitle levensteinTitle = new LevensteinTitle(params); +// double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); +// score += Double.isNaN(result3)?0.0:result3;; +// +// return score >= 0.99; +// } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/GraphDraw.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/GraphDraw.java new file mode 100644 index 0000000..094962a --- /dev/null +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/GraphDraw.java @@ -0,0 +1,111 @@ +package eu.dnetlib.pace; + +import java.util.*; +import java.awt.*; +import java.awt.event.*; +import javax.swing.*; + +public class GraphDraw extends JFrame { + int width; + int height; + + ArrayList nodes; + ArrayList edges; + + public GraphDraw() { //Constructor + this.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + nodes = new ArrayList(); + edges = new ArrayList(); + width = 15; + height = 15; + } + + public GraphDraw(String name) { //Construct with label + this.setTitle(name); + this.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + nodes = new ArrayList(); + edges = new ArrayList(); + width = 15; + height = 15; + } + + class Node { + int x, y; + String name; + Color color; + + public Node(String myName, int myX, int myY) { + x = myX; + y = myY; + name = myName; + color = Color.white; + } + + public Node(String myName, int myX, int myY, Color myColor) { + x = myX; + y = myY; + name = myName; + color = myColor; + } + } + + class edge { + int i,j; + + public edge(int ii, int jj) { + i = ii; + j = jj; + } + } + + public void addNode(String name, int x, int y) { + //add a node at pixel (x,y) + nodes.add(new Node(name,x,y)); + this.repaint(); + } + public void addEdge(int i, int j) { + //add an edge between nodes i and j + edges.add(new edge(i,j)); + this.repaint(); + } + + public void paint(Graphics g) { // draw the nodes and edges + FontMetrics f = g.getFontMetrics(); + int nodeHeight = Math.max(height, f.getHeight()); + g.setColor(Color.black); + for (edge e : edges) { + g.drawLine(nodes.get(e.i).x, nodes.get(e.i).y, + nodes.get(e.j).x, nodes.get(e.j).y); + } + + for (Node n : nodes) { + n.name = n.name.substring(0, 20); + int nodeWidth = Math.max(width, f.stringWidth(n.name)+width/2); + g.setColor(n.color); + g.fillRoundRect(n.x-nodeWidth/2, n.y-nodeHeight/2, + nodeWidth, nodeHeight, 2, 2); + g.setColor(Color.black); + g.drawRoundRect(n.x-nodeWidth/2, n.y-nodeHeight/2, nodeWidth, nodeHeight, 2, 2); + + g.drawString(n.name, n.x-f.stringWidth(n.name)/2, + n.y+f.getHeight()/5); + } + } +} + +class testGraphDraw { + //Here is some example syntax for the GraphDraw class + public static void main(String[] args) { + GraphDraw frame = new GraphDraw("Test Window"); + + frame.setSize(400,300); + + frame.setVisible(true); + + frame.addNode("a", 50,50); + frame.addNode("b", 100,100); + frame.addNode("longNode", 200,200); + frame.addEdge(0,1); + frame.addEdge(0,2); + } +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json new file mode 100644 index 0000000..c22786a --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json @@ -0,0 +1,67 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "datasource", + "orderField" : "name", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "idPath":"$.id", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true", + "maxIterations": "20" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + ], + "decisionTree" : { + "start": { + "fields": [ + { + "field": "websiteurl", + "comparator": "domainExactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1, + "aggregation": "AVG", + "positive": "layer2", + "negative": "NO_MATCH", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "name", + "comparator": "jaroWinkler", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + } + } + ], + "threshold": 0.9, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model" : [ + { "name" : "name", "type" : "String", "path" : "$.name" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" } + ], + "blacklists" : { + "legalname" : [] + }, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.prod.tree.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.prod.tree.conf.json new file mode 100644 index 0000000..ec41e05 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.prod.tree.conf.json @@ -0,0 +1,401 @@ +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "result", + "subEntityType": "resulttype", + "subEntityValue": "publication", + "orderField": "title", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } }, + { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid" + } + }, + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid", + "crossCompare": "alternateid" + } + } + ], + "threshold": 0.5, + "aggregation": "MAX", + "positive": "layer1", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer1": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.9, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "AND", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "altdoi", + "type": "String", + "path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "pid", + "type": "JSON", + "path": "$.instance[*].pid[*]", + "overrideMatch": "true" + }, + { + "name": "alternateid", + "type": "JSON", + "path": "$.instance[*].alternateIdentifier[*]", + "overrideMatch": "true" + }, + { + "name": "title", + "type": "String", + "path": "$.title[?(@.qualifier.classid == 'main title')].value", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.author[*].fullname", + "size": 200 + }, + { + "name": "resulttype", + "type": "String", + "path": "$.resulttype.classid" + } + ], + "blacklists": { + "title": [ + "(?i)^Data Management Plan", + "^Inside Front Cover$", + "(?i)^Poster presentations$", + "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + "^“The Historical Aspects? of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar proteinosis(\\.?)$", + "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + "^Analýza reklamy$", + "^Analysis of advertising$", + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + "^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + "^İçindekiler$", + "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]? response\\.?$" + ] + }, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json index d471ccb..6e1aa77 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json @@ -6,10 +6,10 @@ "subEntityType": "resulttype", "subEntityValue": "publication", "orderField": "title", - "queueMaxSize": "2000", - "groupMaxSize": "100", - "maxChildren": "100", - "slidingWindowSize": "200", + "queueMaxSize": "5000", + "groupMaxSize": "2000", + "maxChildren": "1000", + "slidingWindowSize": "50", "rootBuilder": [ "result", "resultProject_outcome_isProducedBy", @@ -29,8 +29,7 @@ }, "pace": { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { @@ -129,6 +128,7 @@ ], "blacklists": { "title": [ + "(?i)^Data Management Plan", "^Inside Front Cover$", "(?i)^Poster presentations$", "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/publications.to.fix.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/publications.to.fix.json new file mode 100644 index 0000000..9cffaef --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/publications.to.fix.json @@ -0,0 +1,4 @@ +{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "pid": [], "oaiprovenance": {"originDescription": {"metadataNamespace": "", "harvestDate": "2021-06-10T10:03:36.091Z", "baseURL": "file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fsygma%2Fnew_ingestion%2Fcrossref", "datestamp": "", "altered": true, "identifier": ""}}, "relevantdate": [], "contributor": [], "id": "50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446", "subject": [], "lastupdatetimestamp": 1628684944004, "author": [{"surname": "Pan", "fullname": "Pan, Mengwu", "pid": [], "name": "Mengwu", "rank": 1}, {"surname": "Blattner", "fullname": "Blattner, Christine", "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "orcid_pending", "classname": "Open Researcher and Contributor ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0002-7250-5273"}], "name": "Christine", "rank": 2}], "collectedfrom": [{"value": "Sygma", "key": "10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9"}], "instance": [{"refereed": {"classid": "UNKNOWN", "classname": "Unknown", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Cancers", "key": "10|issn__online::69ba871b903253074dcf4054e619afff"}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "https://creativecommons.org/licenses/by/4.0/"}, "url": ["http://dx.doi.org/10.3390/cancers13040745"], "pid": [], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "collectedfrom": {"value": "Sygma", "key": "10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "dateofcollection": "2021-06-10T10:03:36.091Z", "fulltext": [], "dateoftransformation": "2021-07-20T16:59:21.682Z", "description": [], "format": [], "journal": {"issnPrinted": "", "vol": "13", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "name": "Cancers", "iss": "4", "sp": "745", "edition": "", "issnOnline": "2072-6694", "ep": "", "issnLinking": ""}, "coverage": [], "externalReference": [], "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": ["10.3390/cancers13040745", "50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446"], "source": [], "context": [], "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}]} +{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "resourcetype": {"classid": "0001", "classname": "0001", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}, "pid": [{"qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "created", "classname": "created", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2021-02-12T21:12:10Z"}, {"qualifier": {"classid": "published-online", "classname": "published-online", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2021-02-11"}], "contributor": [], "id": "50|doi_________::3bbb03e6ec8df0d219b2d2165ea1d446", "description": [{"value": "More than 40 years of research on p53 have given us tremendous knowledge about this protein. Today we know that p53 plays a role in different biological processes such as proliferation, invasion, pluripotency, metabolism, cell cycle control, ROS (reactive oxygen species) production, apoptosis, inflammation and autophagy. In the nucleus, p53 functions as a bona-fide transcription factor which activates and represses transcription of a number of target genes. In the cytoplasm, p53 can interact with proteins of the apoptotic machinery and by this also induces cell death. Despite being so important for the fate of the cell, expression levels of p53 are kept low in unstressed cells and the protein is largely inactive. The reason for the low expression level is that p53 is efficiently degraded by the ubiquitin-proteasome system and the vast inactivity of the tumor suppressor protein under normal growth conditions is due to the absence of activating and the presence of inactivating posttranslational modifications. E3s are important enzymes for these processes as they decorate p53 with ubiquitin and small ubiquitin-like proteins and by this control p53 degradation, stability and its subcellular localization. In this review, we provide an overview about E3s that target p53 and discuss the connection between p53, E3s and tumorigenesis."}], "lastupdatetimestamp": 1613647061057, "author": [{"fullname": "Mengwu Pan", "pid": [{"qualifier": {"classid": "URL", "classname": "URL", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://academic.microsoft.com/#/detail/3128025883"}], "rank": 1}, {"fullname": "Christine Blattner", "pid": [{"qualifier": {"classid": "URL", "classname": "URL", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://academic.microsoft.com/#/detail/3126711219"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "orcid_pending", "classname": "orcid_pending", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "http://orcid.org/0000-0002-7250-5273"}, {"dataInfo": {"invisible": false, "trust": "0.91", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "orcid", "classname": "orcid", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0002-7250-5273"}], "rank": 2}], "collectedfrom": [{"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, {"value": "UnpayWall", "key": "10|openaire____::8ac8380272269217cb09a928c8caa993"}, {"value": "ORCID", "key": "10|openaire____::806360c771262b4d6770e7cdf04b5c5a"}, {"value": "Microsoft Academic Graph", "key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}], "instance": [{"hostedby": {"value": "Cancers", "key": "10|doajarticles::69ba871b903253074dcf4054e619afff"}, "license": {"value": "https://creativecommons.org/licenses/by/4.0/"}, "url": ["https://www.mdpi.com/2072-6694/13/4/745/pdf", "http://dx.doi.org/10.3390/cancers13040745"], "pid": [{"qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "dateofacceptance": {"value": "2021-02-11"}, "collectedfrom": {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"hostedby": {"value": "Cancers", "key": "10|doajarticles::69ba871b903253074dcf4054e619afff"}, "license": {"value": "cc-by"}, "url": ["https://res.mdpi.com/d_attachment/cancers/cancers-13-00745/article_deploy/cancers-13-00745.pdf"], "pid": [{"qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "collectedfrom": {"value": "UnpayWall", "key": "10|openaire____::8ac8380272269217cb09a928c8caa993"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"hostedby": {"value": "Cancers", "key": "10|doajarticles::69ba871b903253074dcf4054e619afff"}, "url": ["https://www.mdpi.com/2072-6694/13/4/745", "https://www.mdpi.com/2072-6694/13/4/745/pdf", "https://academic.microsoft.com/#/detail/3128658507"], "pid": [{"qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "collectedfrom": {"value": "Microsoft Academic Graph", "key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2021-02-18T11:17:41Z", "fulltext": [], "dateofacceptance": {"value": "2021-02-11"}, "format": [], "journal": {"vol": "13", "sp": "745", "issnOnline": "2072-6694", "name": "Cancers"}, "measures": [], "subject": [{"qualifier": {"classid": "keywords", "classname": "keywords", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Cancer Research"}, {"qualifier": {"classid": "keywords", "classname": "keywords", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Oncology"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Carcinogenesis"}, {"dataInfo": {"invisible": false, "trust": "0.51921105", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "medicine.disease_cause"}, {"dataInfo": {"invisible": false, "trust": "0.51921105", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "medicine"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Cytoplasm"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Transcription factor"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Cell biology"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Ubiquitin"}, {"dataInfo": {"invisible": false, "trust": "0.5209853", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "biology.protein"}, {"dataInfo": {"invisible": false, "trust": "0.5209853", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "biology"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Cell"}, {"dataInfo": {"invisible": false, "trust": "0.51552147", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "sysimport:actionset", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "medicine.anatomical_structure"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Programmed cell death"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Autophagy"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Chemistry"}, {"qualifier": {"classid": "MAG", "classname": "Microsoft Academic Graph classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Subcellular localization"}], "coverage": [], "externalReference": [], "publisher": {"value": "MDPI AG"}, "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["cancers13040745", "10.3390/cancers13040745", "50|doiboost____::3bbb03e6ec8df0d219b2d2165ea1d446", "3128658507"], "source": [{"value": "Crossref"}, {}], "context": [], "title": [{"qualifier": {"classid": "alternative title", "classname": "alternative title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}, {"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}]} +{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "pmc", "classname": "PubMed Central ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "PMC7916862"}], "oaiprovenance": {"originDescription": {"metadataNamespace": "", "harvestDate": "2021-03-06T14:46:06.784Z", "baseURL": "http%3A%2F%2Fwww.pubmedcentral.nih.gov%2Foai%2Foai.cgi", "datestamp": "2021-03-01", "altered": true, "identifier": "oai:pubmedcentral.nih.gov:7916862"}}, "relevantdate": [], "contributor": [], "id": "50|pmc_________::dd5d1e7d3c85628d0bee7c42191a12ec", "description": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "Simple Summary The p53 protein is a transcription factor that initiates cell cycle arrest and apoptosis and by this counteracts tumorigenesis. Because of its anti-proliferative activity, p53 levels are usually low as the protein is rapidly degraded, unless its anti-tumoral activity is required. E3s play an important role in this process. While at earlier times only E3s that target p53 for degradation had been identified, more recent years showed that E3s also control p53 localization and its activity, even independently of its degradation. In addition, more and more E3s that target p53 have been identified in the last years. With this review, we want to provide an overview about the E3s that target p53 and how they control p53 abundance and activity. Abstract More than 40 years of research on p53 have given us tremendous knowledge about this protein. Today we know that p53 plays a role in different biological processes such as proliferation, invasion, pluripotency, metabolism, cell cycle control, ROS (reactive oxygen species) production, apoptosis, inflammation and autophagy. In the nucleus, p53 functions as a bona-fide transcription factor which activates and represses transcription of a number of target genes. In the cytoplasm, p53 can interact with proteins of the apoptotic machinery and by this also induces cell death. Despite being so important for the fate of the cell, expression levels of p53 are kept low in unstressed cells and the protein is largely inactive. The reason for the low expression level is that p53 is efficiently degraded by the ubiquitin-proteasome system and the vast inactivity of the tumor suppressor protein under normal growth conditions is due to the absence of activating and the presence of inactivating posttranslational modifications. E3s are important enzymes for these processes as they decorate p53 with ubiquitin and small ubiquitin-like proteins and by this control p53 degradation, stability and its subcellular localization. In this review, we provide an overview about E3s that target p53 and discuss the connection between p53, E3s and tumorigenesis."}], "lastupdatetimestamp": 1628684994646, "author": [{"surname": "Pan", "fullname": "Pan, Mengwu", "pid": [], "name": "Mengwu", "rank": 1}, {"surname": "Blattner", "fullname": "Blattner, Christine", "pid": [], "name": "Christine", "rank": 2}], "collectedfrom": [{"value": "PubMed Central", "key": "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357"}], "instance": [{"refereed": {"classid": "UNKNOWN", "classname": "Unknown", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Europe PubMed Central", "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"}, "url": ["http://europepmc.org/articles/PMC7916862"], "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "pmc", "classname": "PubMed Central ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "PMC7916862"}], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "pmid", "classname": "PubMed ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": ""}], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "2021-02-01"}, "collectedfrom": {"value": "PubMed Central", "key": "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "dateofcollection": "2021-03-06T14:46:06.784Z", "fulltext": [], "dateoftransformation": "2021-07-26T07:24:22.011Z", "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "2021-02-01"}, "format": [], "journal": {"issnPrinted": "", "vol": "13", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "name": "Cancers", "iss": "4", "sp": "", "edition": "", "issnOnline": "2072-6694", "ep": "", "issnLinking": ""}, "subject": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "Review"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "p53"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "E3s"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "tumor"}], "coverage": [], "externalReference": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "MDPI"}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": ["oai:pubmedcentral.nih.gov:7916862", "50|od_______267::c016e9ed23d32e62d3fb2c7d2fbfae96"], "source": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "Cancers"}], "context": [], "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}]} +{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "pid": [], "oaiprovenance": {"originDescription": {"metadataNamespace": "http://www.openarchives.org/OAI/2.0/oai_dc/", "harvestDate": "2021-02-15T11:13:27.469Z", "baseURL": "https%3A%2F%2Fdoaj.org%2Foai.article", "datestamp": "2021-02-12T00:01:31Z", "altered": true, "identifier": "oai:doaj.org/article:6b18c6ca9eab44ee932083fff4369221"}}, "relevantdate": [], "contributor": [], "id": "50|doajarticles::a46b8cb8138cfb67e1dbfc241171154d", "description": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "More than 40 years of research on p53 have given us tremendous knowledge about this protein. Today we know that p53 plays a role in different biological processes such as proliferation, invasion, pluripotency, metabolism, cell cycle control, ROS (reactive oxygen species) production, apoptosis, inflammation and autophagy. In the nucleus, p53 functions as a bona-fide transcription factor which activates and represses transcription of a number of target genes. In the cytoplasm, p53 can interact with proteins of the apoptotic machinery and by this also induces cell death. Despite being so important for the fate of the cell, expression levels of p53 are kept low in unstressed cells and the protein is largely inactive. The reason for the low expression level is that p53 is efficiently degraded by the ubiquitin-proteasome system and the vast inactivity of the tumor suppressor protein under normal growth conditions is due to the absence of activating and the presence of inactivating posttranslational modifications. E3s are important enzymes for these processes as they decorate p53 with ubiquitin and small ubiquitin-like proteins and by this control p53 degradation, stability and its subcellular localization. In this review, we provide an overview about E3s that target p53 and discuss the connection between p53, E3s and tumorigenesis."}], "lastupdatetimestamp": 1628685120332, "author": [{"fullname": "Mengwu Pan", "pid": [], "rank": 1}, {"fullname": "Christine Blattner", "pid": [], "rank": 2}], "collectedfrom": [{"value": "DOAJ-Articles", "key": "10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}], "instance": [{"refereed": {"classid": "UNKNOWN", "classname": "Unknown", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Cancers", "key": "10|doajarticles::69ba871b903253074dcf4054e619afff"}, "url": ["https://www.mdpi.com/2072-6694/13/4/745"], "pid": [], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "2021-02-01"}, "collectedfrom": {"value": "DOAJ-Articles", "key": "10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "dateofcollection": "2021-02-15T11:13:27.469Z", "fulltext": [], "dateoftransformation": "2021-07-01T02:30:05.029Z", "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "2021-02-01"}, "format": [], "journal": {"issnPrinted": "2072-6694", "vol": "13", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "name": "Cancers", "iss": "745", "sp": "", "edition": "", "issnOnline": "", "ep": "", "issnLinking": ""}, "subject": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "p53"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "E3s"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "value": "tumor"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "lcsh", "classname": "lcsh", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "lcsh:Neoplasms. Tumors. Oncology. Including cancer and carcinogens"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "lcsh", "classname": "lcsh", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "lcsh:RC254-282"}], "coverage": [], "externalReference": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "MDPI AG"}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": ["50|doajarticles::a46b8cb8138cfb67e1dbfc241171154d", "oai:doaj.org/article:6b18c6ca9eab44ee932083fff4369221"], "source": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "Cancers, Vol 13, Iss 745, p 745 (2021)"}], "context": [], "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}]}'] \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json index 62482aa..4956a3d 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json @@ -22,5 +22,11 @@ "paramLongName": "dedupConfPath", "paramDescription": "path of the dedup configuration", "paramRequired": true + }, + { + "paramName": "ut", + "paramLongName": "useTree", + "paramDescription": "chose the tree configuration or not", + "paramRequired": true } ] \ No newline at end of file diff --git a/dnet-dedup.ipr b/dnet-dedup.ipr new file mode 100644 index 0000000..677e437 --- /dev/null +++ b/dnet-dedup.ipr @@ -0,0 +1,113 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dnet-dedup.iws b/dnet-dedup.iws new file mode 100644 index 0000000..57de9a0 --- /dev/null +++ b/dnet-dedup.iws @@ -0,0 +1,418 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java new file mode 100644 index 0000000..9bf05f3 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java @@ -0,0 +1,232 @@ +package eu.dnetlib.pace.util; + +import com.google.common.collect.Lists; +import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.WfConfig; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.MapDocumentComparator; +import eu.dnetlib.pace.tree.JsonListMatch; +import eu.dnetlib.pace.tree.LevensteinTitle; +import eu.dnetlib.pace.tree.SizeMatch; +import eu.dnetlib.pace.tree.TitleVersionMatch; +import eu.dnetlib.pace.tree.support.FieldStats; +import eu.dnetlib.pace.tree.support.TreeProcessor; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.*; + +public class BlockProcessorForTesting { + + public static final List accumulators= new ArrayList<>(); + + private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class); + + private DedupConfig dedupConf; + + public static void constructAccumulator( final DedupConfig dedupConf) { + accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); + accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); + accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()))); + accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list")); + accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); + accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); + } + + public BlockProcessorForTesting(DedupConfig dedupConf) { + this.dedupConf = dedupConf; + } + + public void processSortedBlock(final String key, final List documents, final Reporter context, boolean useTree) { + if (documents.size() > 1) { +// log.info("reducing key: '" + key + "' records: " + q.size()); + process(prepare(documents), context, useTree); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + + public void process(final String key, final Iterable documents, final Reporter context, boolean useTree) { + + final Queue q = prepare(documents); + + if (q.size() > 1) { +// log.info("reducing key: '" + key + "' records: " + q.size()); + process(simplifyQueue(q, key, context), context, useTree); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + + private Queue prepare(final Iterable documents) { + final Queue queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField())); + + final Set seen = new HashSet(); + final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); + + documents.forEach(doc -> { + if (queue.size() <= queueMaxSize) { + final String id = doc.getIdentifier(); + + if (!seen.contains(id)) { + seen.add(id); + queue.add(doc); + } + } + }); + + return queue; + } + + private Queue simplifyQueue(final Queue queue, final String ngram, final Reporter context) { + final Queue q = new LinkedList<>(); + + String fieldRef = ""; + final List tempResults = Lists.newArrayList(); + + while (!queue.isEmpty()) { + final MapDocument result = queue.remove(); + + final String orderFieldName = dedupConf.getWf().getOrderField(); + final Field orderFieldValue = result.values(orderFieldName); + if (!orderFieldValue.isEmpty()) { + final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); + if (field.equals(fieldRef)) { + tempResults.add(result); + } else { + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); + tempResults.clear(); + tempResults.add(result); + fieldRef = field; + } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); + } + } + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); + + return q; + } + + private void populateSimplifiedQueue(final Queue q, + final List tempResults, + final Reporter context, + final String fieldRef, + final String ngram) { + WfConfig wf = dedupConf.getWf(); + if (tempResults.size() < wf.getGroupMaxSize()) { + q.addAll(tempResults); + } else { + context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size()); +// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram); + } + } + + private void process(final Queue queue, final Reporter context, boolean useTree) { + + while (!queue.isEmpty()) { + + final MapDocument pivot = queue.remove(); + final String idPivot = pivot.getIdentifier(); + + WfConfig wf = dedupConf.getWf(); + final Field fieldsPivot = pivot.values(wf.getOrderField()); + final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); + + if (fieldPivot != null) { + int i = 0; + for (final MapDocument curr : queue) { + final String idCurr = curr.getIdentifier(); + + if (mustSkip(idCurr)) { + + context.incrementCounter(wf.getEntityType(), "skip list", 1); + + break; + } + + if (i > wf.getSlidingWindowSize()) { + break; + } + + final Field fieldsCurr = curr.values(wf.getOrderField()); + final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue(); + + if (!idCurr.equals(idPivot) && (fieldCurr != null)) { + +// if (new TreeProcessor(dedupConf).compare(pivot, curr) == true && publicationCompare(pivot, curr, dedupConf) == false) +// emitOutput(true, idPivot, idCurr, context); +// + if(useTree) + emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); + else + emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); + + } + } + } + } + } + + private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { + + double score = 0.0; + //LAYER 1 - comparison of the PIDs json lists + Map params = new HashMap<>(); + params.put("jpath_value", "$.value"); + params.put("jpath_classid", "$.qualifier.classid"); + JsonListMatch jsonListMatch = new JsonListMatch(params); + double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config); + if (result >= 0.5) //if the result of the comparison is greater than the threshold + score += 10.0; //high score because it should match when the first condition is satisfied + else + score += 0.0; + + //LAYER 2 - comparison of the title version and the size of the authors lists + TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); + double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); + SizeMatch sizeMatch = new SizeMatch(params); + double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); + if (Math.min(result1, result2) != 0) + score+=0; + else + score-=2; + + //LAYER 3 - computation of levenshtein on titles + LevensteinTitle levensteinTitle = new LevensteinTitle(params); + double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); + score += Double.isNaN(result3)?0.0:result3; + + return score >= 0.99; + } + + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + + if (result) { + writeSimilarity(context, idPivot, idCurr); + context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); + } + } + + private boolean mustSkip(final String idPivot) { + return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); + } + + private String getNsPrefix(final String id) { + return StringUtils.substringBetween(id, "|", "::"); + } + + private void writeSimilarity(final Reporter context, final String from, final String to) { + final String type = dedupConf.getWf().getEntityType(); + + context.emit(type, from, to); + context.emit(type, to, from); + } +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 91a3274..8657b58 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -47,7 +47,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testNgramPairs() { params.put("ngramLen", 3); - params.put("max", 1); + params.put("max", 2); final ClusteringFunction np = new NgramPairs(params); @@ -59,7 +59,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testSortedNgramPairs() { params.put("ngramLen", 3); - params.put("max", 1); + params.put("max", 2); final ClusteringFunction np = new SortedNgramPairs(params); @@ -70,6 +70,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s2 = "Pisa University"; System.out.println(s2); System.out.println(np.apply(conf, Lists.newArrayList(title(s2)))); + + final String s3 = "Parco Tecnologico Agroalimentare Umbria"; + System.out.println(s3); + System.out.println(np.apply(conf, Lists.newArrayList(title(s3)))); + } @Test @@ -132,6 +137,14 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(s); System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + s = "JRC Open Power Plants Database (JRC-PPDB-OPEN)"; + System.out.println(s); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + + s = "JRC Open Power Plants Database"; + System.out.println(s); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 060526b..6bdd1ad 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -14,7 +14,7 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions; import java.util.HashMap; import java.util.Map; - +@TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ComparatorTest extends AbstractPaceFunctions { private Map params; @@ -119,6 +119,42 @@ public class ComparatorTest extends AbstractPaceFunctions { } + @Test + public void jaroWinklerTest() { + + final JaroWinkler jaroWinkler = new JaroWinkler(params); + + double result = jaroWinkler.distance("Sofia", "Sofìa", conf); + System.out.println("result = " + result); + + result = jaroWinkler.distance("University of Victoria Dataverse", "University of Windsor Dataverse", conf); + System.out.println("result = " + result); + + result = jaroWinkler.distance("Victoria Dataverse", "Windsor Dataverse", conf); + System.out.println("result = " + result); + + final Levenstein levenstein = new Levenstein(params); + + result = levenstein.distance("Victoria", "Windsor", conf); + System.out.println("result = " + result); + + //University of Victoria Dataverse + //University of British Columbia Dataverse + //University of Windsor Dataverse + //University of Waterloo Dataverse + //University of Toronto Dataverse + //University of Ottawa Dataverse + } + + @Test + public void levensteinTitleTest() { + + final LevensteinTitle levensteinTitle = new LevensteinTitle(params); + double result = levensteinTitle.distance("JRC: Open Power Plants Database", "JRC Open Power Plants Database (JRC-PPDB-OPEN)", conf); + + System.out.println("result = " + result); + } + @Test public void jsonListMatchTest(){ diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index dc7b11a..dbf7f08 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.config; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.clustering.ClusteringClass; import eu.dnetlib.pace.clustering.ClusteringCombiner; import eu.dnetlib.pace.model.Field; @@ -128,6 +129,23 @@ public class ConfigTest extends AbstractPaceTest { assertEquals("doi", combine[2].split(":")[1]); } + @Test + public void filterAndCombineTest() { + + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("pub.prod.conf.json")); + + final String json = readFromClasspath("publication.example.json"); + + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + + Collection strings = BlacklistAwareClusteringCombiner.filterAndCombine(mapDocument, dedupConf); + + for (String s: strings) { + System.out.println("s = " + s); + } + + } + @Test public void crossCompareTest() { diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json new file mode 100644 index 0000000..ab34ed8 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json @@ -0,0 +1,402 @@ + +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "result", + "subEntityType": "resulttype", + "subEntityValue": "publication", + "orderField": "title", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } }, + { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid" + } + }, + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid", + "crossCompare": "alternateid" + } + } + ], + "threshold": 0.5, + "aggregation": "MAX", + "positive": "layer1", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer1": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.9, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "AND", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.instance.pid[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "altdoi", + "type": "String", + "path": "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "pid", + "type": "JSON", + "path": "$.instance.pid", + "overrideMatch": "true" + }, + { + "name": "alternateid", + "type": "JSON", + "path": "$.instance.alternateIdentifier", + "overrideMatch": "true" + }, + { + "name": "title", + "type": "String", + "path": "$.title[?(@.qualifier.classid == 'main title')].value", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.author[*].fullname", + "size": 200 + }, + { + "name": "resulttype", + "type": "String", + "path": "$.resulttype.classid" + } + ], + "blacklists": { + "title": [ + "(?i)^Data Management Plan", + "^Inside Front Cover$", + "(?i)^Poster presentations$", + "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + "^“The Historical Aspects? of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar proteinosis(\\.?)$", + "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + "^Analýza reklamy$", + "^Analysis of advertising$", + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + "^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + "^İçindekiler$", + "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]? response\\.?$" + ] + }, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json new file mode 100644 index 0000000..e15cdc5 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json @@ -0,0 +1 @@ +{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "pid": [], "oaiprovenance": {"originDescription": {"metadataNamespace": "", "harvestDate": "2021-06-10T10:03:36.091Z", "baseURL": "file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fsygma%2Fnew_ingestion%2Fcrossref", "datestamp": "", "altered": true, "identifier": ""}}, "relevantdate": [], "contributor": [], "id": "50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446", "subject": [], "lastupdatetimestamp": 1628684944004, "author": [{"surname": "Pan", "fullname": "Pan, Mengwu", "pid": [], "name": "Mengwu", "rank": 1}, {"surname": "Blattner", "fullname": "Blattner, Christine", "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "orcid_pending", "classname": "Open Researcher and Contributor ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0002-7250-5273"}], "name": "Christine", "rank": 2}], "collectedfrom": [{"value": "Sygma", "key": "10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9"}], "instance": [{"refereed": {"classid": "UNKNOWN", "classname": "Unknown", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Cancers", "key": "10|issn__online::69ba871b903253074dcf4054e619afff"}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "https://creativecommons.org/licenses/by/4.0/"}, "url": ["http://dx.doi.org/10.3390/cancers13040745"], "pid": [], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "collectedfrom": {"value": "Sygma", "key": "10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "dateofcollection": "2021-06-10T10:03:36.091Z", "fulltext": [], "dateoftransformation": "2021-07-20T16:59:21.682Z", "description": [], "format": [], "journal": {"issnPrinted": "", "vol": "13", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "name": "Cancers", "iss": "4", "sp": "745", "edition": "", "issnOnline": "2072-6694", "ep": "", "issnLinking": ""}, "coverage": [], "externalReference": [], "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": ["10.3390/cancers13040745", "50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446"], "source": [], "context": [], "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}]} \ No newline at end of file diff --git a/pom.xml b/pom.xml index a19a36c..1e01361 100644 --- a/pom.xml +++ b/pom.xml @@ -228,7 +228,8 @@ 15.0 2.2.0 - 2.9.6 + + 2.6.5 3.3.3 3.5 @@ -260,7 +261,7 @@ true 2.0.1 5.6.1 - ../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.0.6-SNAPSHOT.jar + ../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.1.8-SNAPSHOT.jar @@ -409,6 +410,19 @@ 2.4.0 + + org.mockito + mockito-core + 3.3.3 + test + + + + org.mockito + mockito-junit-jupiter + 3.3.3 + test + diff --git a/release.properties b/release.properties deleted file mode 100644 index f8c96de..0000000 --- a/release.properties +++ /dev/null @@ -1,11 +0,0 @@ -#release configuration -#Tue Sep 29 12:04:49 CEST 2020 -scm.tagNameFormat=@{project.artifactId}-@{project.version} -pushChanges=true -scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git -preparationGoals=clean verify -projectVersionPolicyId=default -remoteTagging=true -scm.commentPrefix=[maven-release-plugin] -exec.snapshotReleasePluginAllowed=false -completedPhase=check-poms