From 5021e5048f0643979eb55dba3545a1fd4a5e4002 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 29 Sep 2020 12:01:25 +0200 Subject: [PATCH] fixed error in the treeprocessor. it used th=-1 as default value, now it use th=1 --- dnet-dedup-test/job-override.properties | 4 + .../src/main/java/eu/dnetlib/Deduper.java | 34 +++- .../eu/dnetlib/jobs/AbstractSparkJob.java | 48 +++++ .../dnetlib/jobs/SparkCreateDedupEntity.java | 63 +++++++ .../eu/dnetlib/jobs/SparkCreateMergeRels.java | 62 +++++++ .../eu/dnetlib/jobs/SparkCreateSimRels.java | 62 +++++++ .../support/ArgumentApplicationParser.java | 95 ++++++++++ .../main/java/eu/dnetlib/support/Block.java | 9 +- .../dnetlib/support/ConnectedComponent.java | 7 +- .../eu/dnetlib/support/OptionsParameter.java | 37 ++++ .../java/eu/dnetlib/support/Relation.java | 9 + .../dedup/oozie_app/config-default.xml | 18 ++ .../resources/dedup/oozie_app/workflow.xml | 167 ++++++++++++++++++ .../java/eu/dnetlib/pace/DedupLocalTest.java | 68 ++++++- .../pace/config/organization.strict.conf.json | 22 ++- .../pace/examples/openorgs.to.fix.json | 6 + .../createDedupEntity_parameters.json | 26 +++ .../createMergeRels_parameters.json | 26 +++ .../parameters/createSimRels_parameters.json | 26 +++ .../pace/tree/support/TreeNodeDef.java | 2 +- 20 files changed, 768 insertions(+), 23 deletions(-) create mode 100644 dnet-dedup-test/job-override.properties create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateMergeRels.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/support/OptionsParameter.java create mode 100644 
dnet-dedup-test/src/main/resources/dedup/oozie_app/config-default.xml create mode 100644 dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/openorgs.to.fix.json create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createMergeRels_parameters.json create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json diff --git a/dnet-dedup-test/job-override.properties b/dnet-dedup-test/job-override.properties new file mode 100644 index 0000000..f494555 --- /dev/null +++ b/dnet-dedup-test/job-override.properties @@ -0,0 +1,4 @@ +entitiesPath = /tmp/graph_openorgs_and_corda/organization +workingPath = /tmp/openorgs_test/workingpath +dedupConfPath = /tmp/openorgs_test/organization.strict.conf.json +numPartitions = 40 \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java index 39426a7..41fd736 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java @@ -1,5 +1,6 @@ package eu.dnetlib; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.hash.Hashing; import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.config.DedupConfig; @@ -11,6 +12,7 @@ import eu.dnetlib.reporter.SparkReporter; import eu.dnetlib.support.Block; import eu.dnetlib.support.ConnectedComponent; import eu.dnetlib.support.Relation; +import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.api.java.JavaPairRDD; @@ -27,7 +29,9 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import scala.Serializable; 
import scala.Tuple2; +import scala.math.Ordering; +import java.nio.charset.StandardCharsets; import java.util.*; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -36,6 +40,8 @@ public class Deduper implements Serializable { private static final Log log = LogFactory.getLog(Deduper.class); + private static ObjectMapper mapper = new ObjectMapper(); + public static JavaPairRDD createSortedBlocks( JavaPairRDD mapDocs, DedupConfig config) { final String of = config.getWf().getOrderField(); @@ -71,7 +77,7 @@ public class Deduper implements Serializable { } public static long hash(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); + return Hashing.murmur3_128().hashString(id, StandardCharsets.UTF_8).asLong(); } public static ConnectedComponent entityMerger(String key, Iterator values) { @@ -79,7 +85,7 @@ public class Deduper implements Serializable { ConnectedComponent cc = new ConnectedComponent(); cc.setCcId(key); cc.setDocs(StreamSupport.stream(Spliterators.spliteratorUnknownSize(values, Spliterator.ORDERED), false) - .collect(Collectors.toSet())); + .collect(Collectors.toCollection(HashSet::new))); return cc; } @@ -145,15 +151,18 @@ public class Deduper implements Serializable { .map(Relation::toEdgeRdd) .rdd(); + JavaRDD ccs = GraphProcessor + .findCCs(vertexes.rdd(), edgeRdd, maxIterations) + .toJavaRDD(); + + JavaRDD mergeRel = ccs + .filter(k -> k.getDocs().size() > 1) + .flatMap(cc -> ccToMergeRel(cc, dedupConf)) + .map(it -> new Relation(it._1(), it._2(), "mergeRel")); +
final Dataset mergeRels = spark .createDataset( - GraphProcessor - .findCCs(vertexes.rdd(), edgeRdd, maxIterations) - .toJavaRDD() - .filter(k -> k.getDocs().size() > 1) - .flatMap(cc -> ccToMergeRel(cc, dedupConf)) - .map(it -> new Relation(it._1(), it._2(), "mergeRel")) - .rdd(), + mergeRel.rdd(), Encoders.bean(Relation.class)); mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath); diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java new file mode 100644 index 0000000..3b22e2e --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java @@ -0,0 +1,48 @@ +package eu.dnetlib.jobs; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; + +import java.io.IOException; +import java.io.Serializable; + +abstract class AbstractSparkJob implements Serializable { + + protected static final int NUM_PARTITIONS = 1000; + + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + public ArgumentApplicationParser parser; // parameters for the spark action + public SparkSession spark; // the spark session + + public AbstractSparkJob() {} + + public AbstractSparkJob(ArgumentApplicationParser parser, SparkSession spark) { + + this.parser = parser; + this.spark = spark; + } + + abstract void run(); + + protected static SparkSession getSparkSession(SparkConf conf) { + return SparkSession.builder().config(conf).getOrCreate(); + } + + protected static void save(Dataset dataset, String outPath, SaveMode mode) { + 
dataset.write().option("compression", "gzip").mode(mode).json(outPath); + } + + protected static DedupConfig loadDedupConfig(String dedupConfPath) { + return DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf.json", AbstractSparkJob.class)); + } + +} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java new file mode 100644 index 0000000..49b75ca --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java @@ -0,0 +1,63 @@ +package eu.dnetlib.jobs; + +import eu.dnetlib.Deduper; +import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; + +public class SparkCreateDedupEntity extends AbstractSparkJob { + + private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupEntity.class); + + public SparkCreateDedupEntity(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + Utility.readFromClasspath("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkCreateDedupEntity( + parser, + getSparkSession(conf) + ).run(); + } + + @Override + public void run() { + + // read oozie parameters + final String entitiesPath = parser.get("entitiesPath"); + final String workingPath = parser.get("workingPath"); + final String dedupConfPath = parser.get("dedupConfPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) +
.orElse(NUM_PARTITIONS); + + log.info("entitiesPath: '{}'", entitiesPath); + log.info("workingPath: '{}'", workingPath); + log.info("dedupConfPath: '{}'", dedupConfPath); + log.info("numPartitions: '{}'", numPartitions); + + Deduper.createDedupEntity( + loadDedupConfig(dedupConfPath), + workingPath + "/mergerels", + entitiesPath, + spark, + workingPath + "/dedupentity" + ); + } + +} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateMergeRels.java b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateMergeRels.java new file mode 100644 index 0000000..02e3e41 --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateMergeRels.java @@ -0,0 +1,62 @@ +package eu.dnetlib.jobs; + +import eu.dnetlib.Deduper; +import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; + +public class SparkCreateMergeRels extends AbstractSparkJob { + + private static final Logger log = LoggerFactory.getLogger(SparkCreateMergeRels.class); + + public SparkCreateMergeRels(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + Utility.readFromClasspath("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkCreateMergeRels( + parser, + getSparkSession(conf) + ).run(); + } + + @Override + public void run() { + + // read oozie parameters + final String entitiesPath = parser.get("entitiesPath"); + final String workingPath = parser.get("workingPath"); + final String dedupConfPath = parser.get("dedupConfPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) +
.map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("entitiesPath: '{}'", entitiesPath); + log.info("workingPath: '{}'", workingPath); + log.info("dedupConfPath: '{}'", dedupConfPath); + log.info("numPartitions: '{}'", numPartitions); + + Deduper.createMergeRels( + loadDedupConfig(dedupConfPath), + entitiesPath, + workingPath + "/mergerels", + workingPath + "/simrels", + spark + ); + } +} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java new file mode 100644 index 0000000..0ffebf7 --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java @@ -0,0 +1,62 @@ +package eu.dnetlib.jobs; + +import eu.dnetlib.Deduper; +import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; + +public class SparkCreateSimRels extends AbstractSparkJob { + + private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + + public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + Utility.readFromClasspath("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkCreateSimRels( + parser, + getSparkSession(conf) + ).run(); + } + + @Override + public void run() { + + // read oozie parameters + final String entitiesPath = parser.get("entitiesPath"); + final String workingPath = parser.get("workingPath"); + final String dedupConfPath = parser.get("dedupConfPath"); + final int numPartitions = 
Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("entitiesPath: '{}'", entitiesPath); + log.info("workingPath: '{}'", workingPath); + log.info("dedupConfPath: '{}'", dedupConfPath); + log.info("numPartitions: '{}'", numPartitions); + + Deduper.createSimRels( + loadDedupConfig(dedupConfPath), + spark, + entitiesPath, + workingPath + "/simrels" + ); + } +} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java b/dnet-dedup-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java new file mode 100644 index 0000000..e2dae4a --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java @@ -0,0 +1,95 @@ +package eu.dnetlib.support; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.cli.*; +import org.apache.commons.io.IOUtils; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Serializable; +import java.io.StringWriter; +import java.util.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public class ArgumentApplicationParser implements Serializable { + + private final Options options = new Options(); + private final Map objectMap = new HashMap<>(); + + private final List compressedValues = new ArrayList<>(); + + public ArgumentApplicationParser(final String json_configuration) throws Exception { + final ObjectMapper mapper = new ObjectMapper(); + final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); + createOptionMap(configuration); + } + + public ArgumentApplicationParser(final OptionsParameter[] configuration) { + createOptionMap(configuration); + } + + private void createOptionMap(final OptionsParameter[] configuration) { + + Arrays + .stream(configuration) + .map( + conf -> { + final Option o = new Option(conf.getParamName(), true, 
conf.getParamDescription()); + o.setLongOpt(conf.getParamLongName()); + o.setRequired(conf.isParamRequired()); + if (conf.isCompressed()) { + compressedValues.add(conf.getParamLongName()); + } + return o; + }) + .forEach(options::addOption); + + // HelpFormatter formatter = new HelpFormatter(); + // formatter.printHelp("myapp", null, options, null, true); + + } + + public static String decompressValue(final String abstractCompressed) { + try { + byte[] byteArray = org.apache.commons.codec.binary.Base64.decodeBase64(abstractCompressed.getBytes(java.nio.charset.StandardCharsets.UTF_8)); + GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); + final StringWriter stringWriter = new StringWriter(); + IOUtils.copy(gis, stringWriter, "UTF-8"); + return stringWriter.toString(); + } catch (Throwable e) { + System.out.println("Wrong value to decompress:" + abstractCompressed); + throw new RuntimeException(e); + } + } + + public static String compressArgument(final String value) throws Exception { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(out); + gzip.write(value.getBytes(java.nio.charset.StandardCharsets.UTF_8)); + gzip.close(); + return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); + } + + public void parseArgument(final String[] args) throws Exception { + CommandLineParser parser = new BasicParser(); + CommandLine cmd = parser.parse(options, args); + Arrays + .stream(cmd.getOptions()) + .forEach( + it -> objectMap + .put( + it.getLongOpt(), + compressedValues.contains(it.getLongOpt()) + ?
decompressValue(it.getValue()) + : it.getValue())); + } + + public String get(final String key) { + return objectMap.get(key); + } + + public Map getObjectMap() { + return objectMap; + } +} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java index acbb9ec..3eff8eb 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java @@ -89,6 +89,13 @@ public class Block implements Serializable { return documents.size(); } - + @Override + public String toString() { + return "Block{" + + "key='" + key + '\'' + + ", size=" + documents.size() + + ", names=" + documents.stream().map(d -> d.getFieldMap().get("country").stringValue()).collect(Collectors.toList()) + + '}'; + } } diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java b/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java index 7f02c34..6f10860 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java @@ -2,6 +2,7 @@ package eu.dnetlib.support; import java.io.IOException; import java.io.Serializable; +import java.util.HashSet; import java.util.Set; import eu.dnetlib.pace.utils.Utility; @@ -14,14 +15,14 @@ import eu.dnetlib.pace.util.PaceException; public class ConnectedComponent implements Serializable { - private Set docs; + private HashSet docs; private String ccId; public ConnectedComponent() { } public ConnectedComponent(Set docs) { - this.docs = docs; + this.docs = new HashSet<>(docs); createID(); } @@ -68,7 +69,7 @@ public class ConnectedComponent implements Serializable { return docs; } - public void setDocs(Set docs) { + public void setDocs(HashSet docs) { this.docs = docs; } diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/support/OptionsParameter.java
b/dnet-dedup-test/src/main/java/eu/dnetlib/support/OptionsParameter.java new file mode 100644 index 0000000..bffb523 --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/OptionsParameter.java @@ -0,0 +1,37 @@ +package eu.dnetlib.support; + +public class OptionsParameter { + + private String paramName; + private String paramLongName; + private String paramDescription; + private boolean paramRequired; + private boolean compressed; + + public OptionsParameter() { + } + + public String getParamName() { + return paramName; + } + + public String getParamLongName() { + return paramLongName; + } + + public String getParamDescription() { + return paramDescription; + } + + public boolean isParamRequired() { + return paramRequired; + } + + public boolean isCompressed() { + return compressed; + } + + public void setCompressed(boolean compressed) { + this.compressed = compressed; + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Relation.java b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Relation.java index 71d2d1b..56e4923 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Relation.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Relation.java @@ -47,4 +47,13 @@ public class Relation implements Serializable { public Edge toEdgeRdd(){ return new Edge<>(Deduper.hash(source), Deduper.hash(target), type); } + + @Override + public String toString() { + return "Relation{" + + "source='" + source + '\'' + + ", target='" + target + '\'' + + ", type='" + type + '\'' + + '}'; + } } diff --git a/dnet-dedup-test/src/main/resources/dedup/oozie_app/config-default.xml b/dnet-dedup-test/src/main/resources/dedup/oozie_app/config-default.xml new file mode 100644 index 0000000..2e0ed9a --- /dev/null +++ b/dnet-dedup-test/src/main/resources/dedup/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + 
oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml b/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml new file mode 100644 index 0000000..fb8e2e2 --- /dev/null +++ b/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml @@ -0,0 +1,167 @@ + + + + entitiesPath + the input entity path + + + workingPath + path for the working directory + + + numPartitions + number of partitions for the spark files + + + dedupConfPath + path for the dedup configuration file + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + yarn + cluster + Create Similarity Relations + eu.dnetlib.jobs.SparkCreateSimRels + dnet-dedup-test-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --entitiesPath${entitiesPath} + --workingPath${workingPath} + --numPartitions${numPartitions} + --dedupConfPath${dedupConfPath} + + + + + + + + yarn + cluster + Create Merge Relations + eu.dnetlib.jobs.SparkCreateMergeRels + dhp-dedup-test-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --entitiesPath${entitiesPath} + --workingPath${workingPath} + --numPartitions${numPartitions} + --dedupConfPath${dedupConfPath} + + + + + + + + yarn + cluster + Create Dedup Entities + eu.dnetlib.jobs.SparkCreateDedupEntity + dhp-dedup-test-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --entitiesPath${entitiesPath} + --workingPath${workingPath} + --numPartitions${numPartitions} + --dedupConfPath${dedupConfPath} + + + + + + + \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index a5eb3cb..03c2776 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java 
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -1,8 +1,12 @@ package eu.dnetlib.pace; import eu.dnetlib.Deduper; +import eu.dnetlib.jobs.SparkCreateDedupEntity; +import eu.dnetlib.jobs.SparkCreateMergeRels; +import eu.dnetlib.jobs.SparkCreateSimRels; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.utils.Utility; +import eu.dnetlib.support.ArgumentApplicationParser; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.junit.Before; @@ -16,12 +20,15 @@ public class DedupLocalTest extends DedupTestUtils { DedupConfig config; JavaSparkContext context; - final String entitiesPath = "/Users/miconis/Desktop/publications_to_fix.json"; + final String entitiesPath = DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/openorgs.to.fix.json").getPath(); + final String workingPath = "/tmp/working_dir"; + final String numPartitions = "10"; + final String dedupConfPath = "/eu/dnetlib/pace/config/organization.strict.conf.json"; @Before public void setup() { - config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/publication.current.conf.json", DedupLocalTest.class)); + config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf.json", DedupLocalTest.class)); spark = SparkSession .builder() @@ -32,6 +39,63 @@ public class DedupLocalTest extends DedupTestUtils { } + @Test + public void createSimRelTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readFromClasspath("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class)); + + parser.parseArgument( + new String[] { + "-e", entitiesPath, + "-w", workingPath, + "-np", numPartitions, + "-dc", dedupConfPath + }); + + new SparkCreateSimRels( + parser, + spark + ).run(); + } + + @Test + public void createMergeRelTest() throws Exception { + +
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readFromClasspath("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class)); + + parser.parseArgument( + new String[] { + "-e", entitiesPath, + "-w", workingPath, + "-np", numPartitions, + "-dc", dedupConfPath + }); + + new SparkCreateMergeRels( + parser, + spark + ).run(); + } + + @Test + public void createDedupEntityTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readFromClasspath("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)); + + parser.parseArgument( + new String[] { + "-e", entitiesPath, + "-w", workingPath, + "-np", numPartitions, + "-dc", dedupConfPath + }); + + new SparkCreateDedupEntity( + parser, + spark + ).run(); + } + @Test public void deduplicationTest() { diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json index c2673d3..42744cd 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json @@ -71,6 +71,13 @@ "weight": 1, "countIfUndefined": "false", "params": {} + }, + { + "field": "rorid", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} } ], "threshold": 1, @@ -115,8 +122,8 @@ "aggregation": "AND", "positive": "layer3", "negative": "NO_MATCH", - "undefined": "layer3", - "ignoreUndefined": "true" + "undefined": "NO_MATCH", + "ignoreUndefined": "false" }, "layer3": { "fields": [ @@ -184,11 +191,12 @@ } }, "model" : [ - { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, - { "name" : 
"legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "country", "type" : "String", "path" : "$.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='GRID')].value"}, + { "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/openorgs.to.fix.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/openorgs.to.fix.json new file mode 100644 index 0000000..574335d --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/openorgs.to.fix.json @@ -0,0 +1,6 @@ +{"eclegalbody": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecresearchorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalname": {"dataInfo": {"deletedbyinference": 
false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "Hospital de Olhos Sadalla Amin Ghanem"}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "GRID", "classname": "GRID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.459901.0"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "ROR", "classname": "ROR", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://ror.org/015w8tk05"}], "websiteurl": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "http://www.sadalla.com.br/en/"}, "oaiprovenance": null, "logourl": null, "collectedfrom": [{"dataInfo": null, "value": "OpenOrgs Database", "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8"}], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": 
"sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "alternativeNames": [], "echighereducation": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "id": "20|openorgsmesh::285d56eaf89ddacbd37b3d8b5ce73110", "eclegalperson": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "lastupdatetimestamp": 1595945048456, "ecinternationalorganizationeurinterests": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "dateofcollection": "2020-07-16", "dateoftransformation": "2020-07-16", "ecnonprofit": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecenterprise": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", 
"classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecinternationalorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecnutscode": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalshortname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "Hospital de Olhos Sadalla Amin Ghanem"}, "country": {"classid": "BR", "classname": "Brazil", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "extraInfo": [], "originalId": ["openorgsmesh::0000049855-972d021b9956334e3c8ec88ebe3731e8"], "ecsmevalidated": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}} +{"eclegalbody": {"dataInfo": {"deletedbyinference": false, 
"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecresearchorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "Ghaem Hospital"}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "GRID", "classname": "GRID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.415529.e"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "ROR", "classname": "ROR", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://ror.org/05n9fs062"}], 
"websiteurl": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "http://www.ghaem-hospital.ir/e-n/Home.aspx"}, "oaiprovenance": null, "logourl": null, "collectedfrom": [{"dataInfo": null, "value": "OpenOrgs Database", "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8"}], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "alternativeNames": [], "echighereducation": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "id": "20|openorgsmesh::74130fd7241f2acf33de68e5c38320c0", "eclegalperson": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "lastupdatetimestamp": 1595945048456, "ecinternationalorganizationeurinterests": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": 
"dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "dateofcollection": "2020-07-16", "dateoftransformation": "2020-07-16", "ecnonprofit": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecenterprise": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecinternationalorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecnutscode": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalshortname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": 
"0.5"}, "value": "Ghaem Hospital"}, "country": {"classid": "IR", "classname": "Iran (Islamic Republic of)", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "extraInfo": [], "originalId": ["openorgsmesh::0000007665-aa660bce41085d633acab2ac7564c9a3"], "ecsmevalidated": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}} +{"eclegalbody": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "ecresearchorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "legalname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Sadalla Amin Ghanem Eye Hospital"}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, 
"inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "qualifier": {"classid": "GRID", "classname": "GRID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.459901.0"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "qualifier": {"classid": "ROR", "classname": "ROR", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://ror.org/015w8tk05"}], "websiteurl": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "http://www.sadalla.com.br/en/"}, "oaiprovenance": null, "logourl": null, "collectedfrom": [{"dataInfo": null, "value": "OpenOrgs Database", "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8"}], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "alternativeNames": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Hospital de Olhos Sadalla Amin Ghanem"}, 
{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Sadalla Amin Ghanem Eye Hospital"}], "echighereducation": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "id": "20|openorgs____::53f1430ea6da6379fa65680f46e88578", "eclegalperson": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "lastupdatetimestamp": 1595945048456, "ecinternationalorganizationeurinterests": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "dateofcollection": "2020-07-16", "dateoftransformation": "2020-07-16", "ecnonprofit": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": 
"0.99"}, "value": "false"}, "ecenterprise": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "ecinternationalorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "ecnutscode": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "legalshortname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Sadalla Amin Ghanem Eye Hospital"}, "country": {"classid": "BR", "classname": "Brazil", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "extraInfo": [], "originalId": ["openorgs____::0000049855"], "ecsmevalidated": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": 
"", "invisible": false, "trust": "0.99"}, "value": "false"}} +{"eclegalbody": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecresearchorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "Sadalla Amin Ghanem Eye Hospital"}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "GRID", "classname": "GRID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.459901.0"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": 
{"classid": "ROR", "classname": "ROR", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://ror.org/015w8tk05"}], "websiteurl": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "http://www.sadalla.com.br/en/"}, "oaiprovenance": null, "logourl": null, "collectedfrom": [{"dataInfo": null, "value": "OpenOrgs Database", "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8"}], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "alternativeNames": [], "echighereducation": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "id": "20|openorgsmesh::6c8abe289b5c269a554ac63360e7f1da", "eclegalperson": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "lastupdatetimestamp": 1595945048456, "ecinternationalorganizationeurinterests": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": 
"sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "dateofcollection": "2020-07-16", "dateoftransformation": "2020-07-16", "ecnonprofit": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecenterprise": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecinternationalorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecnutscode": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalshortname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": 
"dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "Sadalla Amin Ghanem Eye Hospital"}, "country": {"classid": "BR", "classname": "Brazil", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "extraInfo": [], "originalId": ["openorgsmesh::0000049855-9a405d988b6b2129afaa097be4e10c23"], "ecsmevalidated": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}} +{"eclegalbody": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecresearchorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "\u0628\u06cc\u0645\u0627\u0631\u0633\u062a\u0627\u0646 \u0642\u0627\u0626\u0645"}, "pid": [{"dataInfo": {"deletedbyinference": false, 
"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "GRID", "classname": "GRID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.415529.e"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "ROR", "classname": "ROR", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://ror.org/05n9fs062"}], "websiteurl": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "http://www.ghaem-hospital.ir/e-n/Home.aspx"}, "oaiprovenance": null, "logourl": null, "collectedfrom": [{"dataInfo": null, "value": "OpenOrgs Database", "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8"}], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "alternativeNames": [], "echighereducation": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": 
"sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "id": "20|openorgsmesh::0d6694cedbcb9ef04f29777911e66527", "eclegalperson": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "lastupdatetimestamp": 1595945048456, "ecinternationalorganizationeurinterests": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "dateofcollection": "2020-07-16", "dateoftransformation": "2020-07-16", "ecnonprofit": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecenterprise": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecinternationalorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": 
"sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "ecnutscode": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}, "legalshortname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "\u0628\u06cc\u0645\u0627\u0631\u0633\u062a\u0627\u0646 \u0642\u0627\u0626\u0645"}, "country": {"classid": "IR", "classname": "Iran (Islamic Republic of)", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "extraInfo": [], "originalId": ["openorgsmesh::0000007665-132774ec4dd8b15a8ab11036918b1e21"], "ecsmevalidated": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.5"}, "value": "false"}} +{"eclegalbody": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": 
"0.99"}, "value": "false"}, "ecresearchorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "legalname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Ghaem Hospital"}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "qualifier": {"classid": "GRID", "classname": "GRID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.415529.e"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "qualifier": {"classid": "ROR", "classname": "ROR", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://ror.org/05n9fs062"}], "websiteurl": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": 
false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "http://www.ghaem-hospital.ir/e-n/Home.aspx"}, "oaiprovenance": null, "logourl": null, "collectedfrom": [{"dataInfo": null, "value": "OpenOrgs Database", "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8"}], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "alternativeNames": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Ghaem Hospital"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "\u0628\u06cc\u0645\u0627\u0631\u0633\u062a\u0627\u0646 \u0642\u0627\u0626\u0645"}], "echighereducation": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "id": "20|openorgs____::a006d049e5f37a53cab32dbf89137290", "eclegalperson": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": 
"sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "lastupdatetimestamp": 1595945048456, "ecinternationalorganizationeurinterests": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "dateofcollection": "2020-07-16", "dateoftransformation": "2020-07-16", "ecnonprofit": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "ecenterprise": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "ecinternationalorganization": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "ecnutscode": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", 
"schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}, "legalshortname": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "Ghaem Hospital"}, "country": {"classid": "IR", "classname": "Iran (Islamic Republic of)", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "extraInfo": [], "originalId": ["openorgs____::0000007665"], "ecsmevalidated": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.99"}, "value": "false"}} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json new file mode 100644 index 0000000..4eb30a9 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "e", + "paramLongName": "entitiesPath", + "paramDescription": "the input entities", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + }, + { + 
"paramName": "dc", + "paramLongName": "dedupConfPath", + "paramDescription": "path of the dedup configuration", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createMergeRels_parameters.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createMergeRels_parameters.json new file mode 100644 index 0000000..4eb30a9 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createMergeRels_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "e", + "paramLongName": "entitiesPath", + "paramDescription": "the input entities", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + }, + { + "paramName": "dc", + "paramLongName": "dedupConfPath", + "paramDescription": "path of the dedup configuration", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json new file mode 100644 index 0000000..4eb30a9 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createSimRels_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "e", + "paramLongName": "entitiesPath", + "paramDescription": "the input entities", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + 
"paramRequired": false + }, + { + "paramName": "dc", + "paramLongName": "dedupConfPath", + "paramDescription": "path of the dedup configuration", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 530839c..b6e27fa 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable { fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats( weight, - Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")), + Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()),