From abd9034da06d6eb9df16fe93a6688fb00045e2af Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 11 Dec 2019 15:43:24 +0100 Subject: [PATCH] implemented DedupRecord factory with the merge of publications --- .gitignore | 3 +- dhp-common/pom.xml | 10 +- .../ArgumentApplicationParser.java | 41 +++- .../dhp/application/OptionsParameter.java | 9 + .../ArgumentApplicationParserTest.java | 10 + .../eu/dnetlib/application/parameters.json | 21 +- .../eu/dnetlib/dhp/schema/oaf/Instance.java | 27 +++ .../eu/dnetlib/dhp/schema/oaf/Result.java | 84 +------- .../eu/dnetlib/dhp/transform/ext_simple.xsl | 2 +- .../resources/eu/dnetlib/dhp/transform/tr.xml | 12 +- .../java/eu/dnetlib/dedup/DatePicker.java | 119 +++++++++++ .../eu/dnetlib/dedup/DedupRecordFactory.java | 83 ++++---- .../java/eu/dnetlib/dedup/DedupUtility.java | 196 +++++++++++++++++- .../java/eu/dnetlib/dedup/OafEntityType.java | 16 +- .../dedup/SparkCreateConnectedComponent.java | 4 +- .../dnetlib/dedup/SparkCreateDedupRecord.java | 74 ++----- .../eu/dnetlib/dedup/SparkCreateSimRels.java | 2 +- .../dhp/dedup/dedupRecord_parameters.json | 33 +++ .../dnetlib/dhp/dedup/dedup_parameters.json | 36 +++- .../dhp/dedup/oozie_app/config-default.xml | 17 +- .../dnetlib/dhp/dedup/oozie_app/workflow.xml | 25 +++ .../eu/dnetlib/dedup/MergeAuthorTest.java | 61 ++++++ .../dnetlib/dedup/SparkCreateDedupTest.java | 23 +- .../eu/dnetlib/dedup/json/authors_merge.json | 3 + .../dhp/distcp/oozie_app/config-default.xml | 8 +- .../dnetlib/dhp/distcp/oozie_app/workflow.xml | 4 +- .../dhp/graph/oozie_app/config-default.xml | 12 +- dhp-workflows/docs/oozie-installer.markdown | 4 +- dhp-workflows/pom.xml | 4 +- pom.xml | 5 + 30 files changed, 686 insertions(+), 262 deletions(-) create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json create mode 100644 dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java create mode 100644 dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json diff --git a/.gitignore b/.gitignore index 486eacee9..3f00d9729 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,5 @@ /*/build /build spark-warehouse -/dhp-workflows/dhp-graph-mapper/job-override.properties +/*/*/job-override.properties + diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 6fac06b68..43c2a3834 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -17,6 +17,10 @@ commons-cli commons-cli + + commons-io + commons-io + org.apache.commons commons-lang3 @@ -29,21 +33,15 @@ javax.persistence javax.persistence-api - com.fasterxml.jackson.core jackson-databind - com.rabbitmq amqp-client - - commons-io - commons-io - diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index a4970a928..cbfc5caf1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -2,17 +2,25 @@ package eu.dnetlib.dhp.application; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.cli.*; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.io.IOUtils; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.Serializable; -import java.util.Arrays; -import 
java.util.HashMap; -import java.util.Map; +import java.io.StringWriter; +import java.util.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; +import java.util.zip.Inflater; public class ArgumentApplicationParser implements Serializable { private final Options options = new Options(); private final Map objectMap = new HashMap<>(); + private final List compressedValues = new ArrayList<>(); + public ArgumentApplicationParser(final String json_configuration) throws Exception { final ObjectMapper mapper = new ObjectMapper(); final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); @@ -29,6 +37,9 @@ public class ArgumentApplicationParser implements Serializable { final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); o.setLongOpt(conf.getParamLongName()); o.setRequired(conf.isParamRequired()); + if (conf.isCompressed()) { + compressedValues.add(conf.getParamLongName()); + } return o; }).forEach(options::addOption); @@ -38,10 +49,32 @@ public class ArgumentApplicationParser implements Serializable { } + + public static String decompressValue(final String abstractCompressed) { + try { + byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); + GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); + final StringWriter stringWriter = new StringWriter(); + IOUtils.copy(gis, stringWriter); + return stringWriter.toString(); + } catch (Throwable e) { + System.out.println("Wrong value to decompress:" + abstractCompressed); + throw new RuntimeException(e); + } + } + + public static String compressArgument(final String value) throws Exception{ + ByteArrayOutputStream out = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(out); + gzip.write(value.getBytes()); + gzip.close(); + return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); + } + public void parseArgument(final String[] args) throws Exception { CommandLineParser parser = new BasicParser(); CommandLine cmd = parser.parse(options, args); - Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue())); + Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), compressedValues.contains(it.getLongOpt())? 
decompressValue(it.getValue()): it.getValue())); } public String get(final String key) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java index 92079fce7..4e7c2826b 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java @@ -7,6 +7,7 @@ public class OptionsParameter { private String paramLongName; private String paramDescription; private boolean paramRequired; + private boolean compressed; public OptionsParameter() { } @@ -26,4 +27,12 @@ public class OptionsParameter { public boolean isParamRequired() { return paramRequired; } + + public boolean isCompressed() { + return compressed; + } + + public void setCompressed(boolean compressed) { + this.compressed = compressed; + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java index 2033919b9..fdea3c2d4 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java @@ -3,6 +3,10 @@ package eu.dnetlib.dhp.application; import org.apache.commons.io.IOUtils; import org.junit.Test; +import java.io.ByteArrayOutputStream; +import java.util.Base64; +import java.util.zip.GZIPOutputStream; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -24,6 +28,7 @@ public class ArgumentApplicationParserTest { "-ro", "value7", "-rr", "value8", "-w", "value9", + "-cc", ArgumentApplicationParser.compressArgument(jsonConfiguration) }); assertNotNull(parser.get("hdfsPath")); assertNotNull(parser.get("apidescriptor")); @@ -45,7 +50,12 @@ public class ArgumentApplicationParserTest { assertEquals("value7", parser.get("rabbitOngoingQueue")); assertEquals("value8", parser.get("rabbitReportQueue")); assertEquals("value9", parser.get("workflowId")); + assertEquals(jsonConfiguration, parser.get("ccCoco")); } + + + + } diff --git a/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json b/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json index 60c2d391a..13c199166 100644 --- a/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json +++ b/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json @@ -1,12 +1,13 @@ [ - {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, - {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, - {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, - {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true}, - {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rp", "paramLongName":"rabbitPassWord", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true}, - {"paramName":"ro", 
"paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true}, - {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true} + {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, + {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, + {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, + {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true}, + {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true}, + {"paramName":"rp", "paramLongName":"rabbitPassWord", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true}, + {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true}, + {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true}, + {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true}, + {"paramName":"cc", "paramLongName":"ccCoco", "paramDescription": "the identifier of the dnet Workflow", "compressed":true,"paramRequired": true} ] \ No newline at end of file diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index f27704c5c..dc5ac61e8 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -84,4 +84,31 @@ public class Instance implements Serializable { public void setDateofacceptance(Field dateofacceptance) { this.dateofacceptance = dateofacceptance; } + public String toComparableString(){ + return String.format("%s::%s::%s::%s", + hostedby != null && hostedby.getKey()!= null ? hostedby.getKey().toLowerCase() : "", + accessright!= null && accessright.getClassid()!= null ? accessright.getClassid() : "", + instancetype!= null && instancetype.getClassid()!= null ? instancetype.getClassid() : "", + url != null ? 
url:""); + } + + @Override + public int hashCode() { + return toComparableString().hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + Instance other = (Instance) obj; + + return toComparableString() + .equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 0e34d8ba6..1bb7f6a67 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -6,6 +6,8 @@ import java.io.Serializable; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; public abstract class Result extends OafEntity implements Serializable { @@ -251,13 +253,12 @@ public abstract class Result extends OafEntity implements Serializable { Result r = (Result) e; - mergeAuthors(r.getAuthor()); + //TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function) -// if (author == null) -// author = r.getAuthor(); //authors will be replaced because they could be too much // dateofacceptance = r.getDateofacceptance(); -// instance = mergeLists(instance, r.getInstance()); + + instance = mergeLists(instance, r.getInstance()); if (r.getResulttype() != null) resulttype = r.getResulttype(); @@ -309,80 +310,5 @@ public abstract class Result extends OafEntity implements Serializable { } - public void mergeAuthors(List authors){ - int c1 = countAuthorsPids(author); - int c2 = countAuthorsPids(authors); - int s1 = authorsSize(author); - int s2 = authorsSize(authors); - - //if both have no authors with pids and authors is bigger than author - if (c1 == 0 && c2 == 0 && author.size() authors){ - if (authors == null) - return -1; - - return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count(); - } - - public int authorsSize(List authors){ - if (authors == null) - return 0; - return authors.size(); - } - - public String extractAuthorPid(Author a){ - - if(a == null || a.getPid() == null || a.getPid().size() == 0) - return null; - - StringBuilder mainPid = new StringBuilder(); - - a.getPid().forEach(pid ->{ - if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) { - mainPid.setLength(0); - mainPid.append(pid.getValue()); - } - else { - if(mainPid.length() == 0) - mainPid.append(pid.getValue()); - } - }); - - return mainPid.toString(); - - } } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index 5f5ed5a3b..cef50aa95 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -9,7 +9,7 @@ - + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml index ef6d9f7ac..a9eae8576 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml +++ 
b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml
@@ -1,11 +1,11 @@
- - - - - + + + + +
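Note on the ArgumentApplicationParser change earlier in this patch: options flagged as "compressed" carry their value gzip-compressed and Base64-encoded, so large payloads (for example a whole dedup configuration) survive the command line; compressArgument packs the value and parseArgument transparently unpacks it for those options. A minimal standalone sketch of that round trip, assuming commons-io and commons-codec on the classpath as in dhp-common's pom (class and variable names here are illustrative, not part of the patch):

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

// Illustrative round trip for "compressed" command-line arguments:
// gzip the value, Base64-encode it, and reverse the two steps when parsing.
public class CompressedArgumentExample {

    static String compress(String value) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
            gzip.write(value.getBytes(StandardCharsets.UTF_8));
        }
        return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
    }

    static String decompress(String encoded) throws Exception {
        byte[] bytes = Base64.decodeBase64(encoded);
        try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(bytes))) {
            StringWriter writer = new StringWriter();
            IOUtils.copy(gis, writer, StandardCharsets.UTF_8);
            return writer.toString();
        }
    }

    public static void main(String[] args) throws Exception {
        String dedupConf = "{\"wf\":{\"threshold\":\"0.9\"}}"; // stand-in for a real configuration
        String packed = compress(dedupConf);
        System.out.println(dedupConf.equals(decompress(packed))); // prints: true
    }
}

This is the mechanism that lets dedup_parameters.json and dedupRecord_parameters.json further down mark the dedupConf option with "compressed": true.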
@@ -24,7 +24,7 @@ - + diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java new file mode 100644 index 000000000..73f178edc --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java @@ -0,0 +1,119 @@ +package eu.dnetlib.dedup; + +import eu.dnetlib.dhp.schema.oaf.Field; +import org.apache.commons.lang.StringUtils; + +import java.time.Year; +import java.util.*; +import java.util.stream.Collectors; + +import static java.util.Collections.reverseOrder; +import static java.util.Map.Entry.comparingByValue; +import static java.util.stream.Collectors.toMap; +import static org.apache.commons.lang.StringUtils.endsWith; +import static org.apache.commons.lang.StringUtils.substringBefore; + +public class DatePicker { + + private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; + private static final String DATE_DEFAULT_SUFFIX = "01-01"; + private static final int YEAR_LB = 1300; + private static final int YEAR_UB = Year.now().getValue() + 5; + + public static Field pick(final Collection dateofacceptance) { + + final Map frequencies = dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect( + Collectors.toConcurrentMap( + w -> w, w -> 1, Integer::sum)); + + if (frequencies.isEmpty()) { + return new Field<>(); + } + + final Field date = new Field<>(); + date.setValue(frequencies.keySet().iterator().next()); + + // let's sort this map by values first, filtering out invalid dates + final Map sorted = frequencies + .entrySet() + .stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap( + Map.Entry::getKey, + Map.Entry::getValue, (e1, e2) -> e2, + LinkedHashMap::new)); + + // shortcut + if (sorted.size() == 0) { + return date; + } + + // voting method (1/3 + 1) wins + if (sorted.size() >= 3) { + final int acceptThreshold = (sorted.size() / 3) + 1; + final List accepted = sorted.entrySet().stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e -> e.getKey()) + .collect(Collectors.toList()); + + // cannot find strong majority + if (accepted.isEmpty()) { + final int max = sorted.values().iterator().next(); + Optional first = sorted.entrySet().stream() + .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } + + date.setValue(sorted.keySet().iterator().next()); + return date; + } + + if (accepted.size() == 1) { + date.setValue(accepted.get(0)); + return date; + } else { + final Optional first = accepted.stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } + + return date; + } + + //1st non YYYY-01-01 is returned + } else { + if (sorted.size() == 2) { + for (Map.Entry e : sorted.entrySet()) { + if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { + date.setValue(e.getKey()); + return date; + } + } + } + + // none of the dates seems good enough, return the 1st one + date.setValue(sorted.keySet().iterator().next()); + return date; + } + } + + private static boolean inRange(final String date) { + final int year = Integer.parseInt(substringBefore(date, "-")); + return year >= YEAR_LB && year <= YEAR_UB; + } + +} \ No newline at end of file diff --git 
a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java index 371e80349..000be640a 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java @@ -1,6 +1,5 @@ package eu.dnetlib.dedup; -import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; @@ -10,23 +9,20 @@ import org.apache.commons.lang.StringUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.codehaus.jackson.map.ObjectMapper; import scala.Tuple2; -import java.io.IOException; import java.util.Collection; -import java.util.List; import java.util.Random; import static java.util.stream.Collectors.toMap; public class DedupRecordFactory { - public JavaRDD createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){ + public static JavaRDD createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){ // final JavaPairRDD inputJsonEntities = sc.textFile(entitiesInputPath) @@ -51,7 +47,12 @@ public class DedupRecordFactory { String idValue = json._1(); - String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2()); + String trust =""; + try { + trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2()); + } catch (Throwable e) { + + } //TODO remember to replace this with the actual trust retrieving if (StringUtils.isBlank(trust)) { @@ -71,28 +72,32 @@ public class DedupRecordFactory { .groupByKey(); + + + + switch(entityType){ - case Publication: - return sortedJoinResult.map(this::publicationMerger); - case Dataset: - return sortedJoinResult.map(this::datasetMerger); - case Project: - return sortedJoinResult.map(this::projectMerger); - case Software: - return sortedJoinResult.map(this::softwareMerger); - case Datasource: - return sortedJoinResult.map(this::datasourceMerger); - case Organization: - return sortedJoinResult.map(this::organizationMerger); - case OtherResearchProduct: - return sortedJoinResult.map(this::otherresearchproductMerger); + case publication: + return sortedJoinResult.map(DedupRecordFactory::publicationMerger); + case dataset: + return sortedJoinResult.map(DedupRecordFactory::datasetMerger); + case project: + return sortedJoinResult.map(DedupRecordFactory::projectMerger); + case software: + return sortedJoinResult.map(DedupRecordFactory::softwareMerger); + case datasource: + return sortedJoinResult.map(DedupRecordFactory::datasourceMerger); + case organization: + return sortedJoinResult.map(DedupRecordFactory::organizationMerger); + case otherresearchproduct: + return sortedJoinResult.map(DedupRecordFactory::otherresearchproductMerger); default: return null; } } - private Publication publicationMerger(Tuple2> e){ + private static Publication publicationMerger(Tuple2> e){ Publication p = new Publication(); //the 
result of the merge, to be returned at the end @@ -101,67 +106,59 @@ public class DedupRecordFactory { final ObjectMapper mapper = new ObjectMapper(); final Collection dateofacceptance = Lists.newArrayList(); - final Collection> authors = Lists.newArrayList(); - final Collection> instances = Lists.newArrayList(); + StringBuilder trust = new StringBuilder("0.0"); + if (e._2() != null) e._2().forEach(pub -> { try { Publication publication = mapper.readValue(pub, Publication.class); final String currentTrust = publication.getDataInfo().getTrust(); - if (!currentTrust.equals("1.0")) { + if (!"1.0".equals(currentTrust)) { trust.setLength(0); trust.append(currentTrust); } - p.mergeFrom(publication); - + p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); //add to the list if they are not null if (publication.getDateofacceptance() != null) dateofacceptance.add(publication.getDateofacceptance().getValue()); - if (publication.getAuthor() != null) - authors.add(publication.getAuthor()); - if (publication.getInstance() != null) - instances.add(publication.getInstance()); - - } catch (Exception exc){} - + } catch (Exception exc){ + throw new RuntimeException(exc); + } }); - - p.setAuthor(null); //TODO create a single list of authors to put in the final publication - - + p.setDateofacceptance(DatePicker.pick(dateofacceptance)); return p; } - private Dataset datasetMerger(Tuple2> e){ + private static Dataset datasetMerger(Tuple2> e){ throw new NotImplementedException(); } - private Project projectMerger(Tuple2> e){ + private static Project projectMerger(Tuple2> e){ throw new NotImplementedException(); } - private Software softwareMerger(Tuple2> e){ + private static Software softwareMerger(Tuple2> e){ throw new NotImplementedException(); } - private Datasource datasourceMerger(Tuple2> e){ + private static Datasource datasourceMerger(Tuple2> e){ throw new NotImplementedException(); } - private Organization organizationMerger(Tuple2> e){ + private static Organization organizationMerger(Tuple2> e){ throw new NotImplementedException(); } - private OtherResearchProduct otherresearchproductMerger(Tuple2> e){ + private static OtherResearchProduct otherresearchproductMerger(Tuple2> e){ throw new NotImplementedException(); } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java index b65e866f1..388ab9b69 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java @@ -1,11 +1,17 @@ package eu.dnetlib.dedup; import com.google.common.collect.Sets; +import com.wcohen.ss.JaroWinkler; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; + import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.Person; import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -14,32 +20,35 @@ import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; +import scala.Tuple2; import java.io.IOException; import 
java.io.StringWriter; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class DedupUtility { + private static final Double THRESHOLD = 0.95; public static Map constructAccumulator(final DedupConfig dedupConf, final SparkContext context) { Map accumulators = new HashMap<>(); - String acc1 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"); + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list"); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); accumulators.put(acc6, context.longAccumulator(acc6)); return accumulators; @@ -52,7 +61,7 @@ public class DedupUtility { public static void deleteIfExists(String path) throws IOException { Configuration conf = new Configuration(); FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))){ + if (fileSystem.exists(new Path(path))) { fileSystem.delete(new Path(path), true); } } @@ -91,4 +100,171 @@ public class DedupUtility { return null; } } + + + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); + + if(pa == pb){ + base = sa>sb?a:b; + enrich = sa>sb?b:a; + } else { + base = pa>pb?a:b; + enrich = pa>pb?b:a; + } + enrichPidFromList(base, enrich); + return base; + + + +// //if both have no authors with pids +// if (pa < 1 && pb < 1) { +// //B is bigger than A +// if (sa < sb) +// return b; +// //A is bigger than B +// else +// return a; +// } +// //If A has author with pids +// if (pa > 0) { +// //B has no author with pid +// if (pb < 1) +// return a; +// //B has author with pid +// else { +// enrichPidFromList(a, b); +// return a; +// } +// } +// //If B has author with pids +// //A has no author with pid +// if 
(pa < 1) +// return b; +// //A has author with pid +// else { +// enrichPidFromList(b, a); +// return b; +// } + } + + private static void enrichPidFromList(List base, List enrich) { + if(base==null || enrich == null) + return; + final Map basePidAuthorMap = base.stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap(a -> a.getPid() + .stream() + .map(p -> new Tuple2<>(p.toComparableString(), a)) + ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); + + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); + + + pidToEnrich.forEach(a -> { + Optional> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1()> THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); + } + + public static String createEntityPath(final String basePath, final String entityType) { + return String.format("%s/%s", basePath,entityType); + } + + public static String createSimRelPath(final String basePath, final String entityType) { + return String.format("%s/%s_simRel", basePath,entityType); + } + + public static String createMergeRelPath(final String basePath, final String entityType) { + return String.format("%s/%s_mergeRel", basePath,entityType); + } + + private static Double sim(Author a, Author b) { + + final Person pa = parse(a); + final Person pb = parse(b); + + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler().score( + normalize(pa.getSurnameString()), + normalize(pb.getSurnameString())); + } else { + return new JaroWinkler().score( + normalize(pa.getNormalisedFullname()), + normalize(pb.getNormalisedFullname())); + } + } + + private static String normalize(final String s) { + return nfd(s).toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } + + + private static int countAuthorsPids(List authors) { + if (authors == null) + return 0; + + return (int) authors.stream().map(DedupUtility::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count(); + } + + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } + + + private static boolean isAccurate(final Author a) { + return StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname()); + } + + private static String extractAuthorPid(Author a) { + + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return null; + + StringBuilder mainPid = new StringBuilder(); + + a.getPid().forEach(pid -> { + if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) { + mainPid.setLength(0); + mainPid.append(pid.getValue()); + } else { + if (mainPid.length() == 0) + 
mainPid.append(pid.getValue()); + } + }); + + return mainPid.toString(); + + } } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java index 4ff2fa873..fb347ed51 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java @@ -2,12 +2,14 @@ package eu.dnetlib.dedup; public enum OafEntityType { - Datasource, - Organization, - Project, - Dataset, - OtherResearchProduct, - Software, - Publication + datasource, + organization, + project, + dataset, + otherresearchproduct, + software, + publication + + } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index eacf3d479..9783e93d6 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -46,7 +46,7 @@ public class SparkCreateConnectedComponent { s -> new Tuple2((long) s.hashCode(), s) ); - final Dataset similarityRelations = spark.read().load(targetPath + "/" + entity+"_simrel").as(Encoders.bean(Relation.class)); + final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class)); final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(it.getSource().hashCode(), it.getTarget().hashCode(), it.getRelClass())).rdd(); @@ -73,7 +73,7 @@ public class SparkCreateConnectedComponent { return tmp.stream(); }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(targetPath+"/"+entity+"_mergeRels"); + mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity)); } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java index 56bdc20f1..db2306526 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java @@ -1,71 +1,39 @@ package eu.dnetlib.dedup; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Lists; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; import org.apache.commons.io.IOUtils; -import org.apache.spark.Partitioner; -import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.Optional; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.StructType; -import scala.Tuple2; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; public class 
SparkCreateDedupRecord { public static void main(String[] args) throws Exception { -// final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); -// parser.parseArgument(args); -// final SparkSession spark = SparkSession -// .builder() -// .appName(SparkCreateDedupRecord.class.getSimpleName()) -// .master(parser.get("master")) -// .getOrCreate(); -// -// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); -// final String inputPath = parser.get("sourcePath"); -// final String entity = parser.get("entity"); -// final String targetPath = parser.get("targetPath"); -//// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); -// final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); -// -// // -// final JavaPairRDD inputJsonEntities = sc.textFile(inputPath + "/" + entity) -// .mapToPair((PairFunction)it-> -// new Tuple2(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it) -// ); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateDedupRecord.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); -// //: source is the dedup_id, target is the id of the mergedIn -// JavaPairRDD mergeRels = spark -// .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class)) -// .where("relClass=='merges'") -// .javaRDD() -// .mapToPair( -// (PairFunction)r-> -// new Tuple2(r.getTarget(), r.getSource()) -// ); -// -// // -// final JavaPairRDD p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction>, String, String>) Tuple2::_2); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String sourcePath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String dedupPath = parser.get("dedupPath"); +// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + + final JavaRDD dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf); + dedupRecord.map(r-> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }).saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json"); - StructType schema = Encoders.bean(Publication.class).schema(); - System.out.println(schema); } } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index 2ae85baf3..48d442d04 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -77,7 +77,7 @@ public class SparkCreateSimRels { return r; }); - spark.createDataset(isSimilarToRDD.rdd(), 
Encoders.bean(Relation.class)).write().mode("overwrite").save(targetPath+"/"+entity+"_simrel"); + spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json new file mode 100644 index 000000000..de744dfb6 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json @@ -0,0 +1,33 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the path of the sequential file to read", + "paramRequired": true + }, + { + "paramName": "e", + "paramLongName": "entity", + "paramDescription": "the type of entity to be deduped", + "paramRequired": true + }, + { + "paramName": "c", + "paramLongName": "dedupConf", + "paramDescription": "dedup configuration to be used", + "compressed": true, + "paramRequired": true + }, + { + "paramName": "d", + "paramLongName": "dedupPath", + "paramDescription": "dedup path to load mergeRelation", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json index d9a0dc8b9..8ba8515d0 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json @@ -1,7 +1,33 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequential file to read", "paramRequired": true}, - {"paramName":"e", "paramLongName":"entity", "paramDescription": "the type of entity to be deduped", "paramRequired": true}, - {"paramName":"c", "paramLongName":"dedupConf", "paramDescription": "dedup configuration to be used", "paramRequired": true}, - {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "target path to save dedup result", "paramRequired": true} + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the path of the sequential file to read", + "paramRequired": true + }, + { + "paramName": "e", + "paramLongName": "entity", + "paramDescription": "the type of entity to be deduped", + "paramRequired": true + }, + { + "paramName": "c", + "paramLongName": "dedupConf", + "paramDescription": "dedup configuration to be used", + "compressed": true, + "paramRequired": true + }, + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "target path to save dedup result", + "paramRequired": true + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml index e654bbbb6..fcab9dd00 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml +++ 
b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml @@ -1,31 +1,26 @@ jobTracker - yarnRM + yarnRM nameNode - hdfs://nameservice1 + hdfs://nameservice1 oozie.use.system.libpath - true + true oozie.action.sharelib.for.spark - spark2 + spark2 hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 hive_db_name - openaire + openaire - - dedupConf - {"wf":{"threshold":"0.9","dedupRun":"001","entityType":"organization","orderField":"legalname","queueMaxSize":"2000","groupMaxSize":"50","slidingWindowSize":"200","idPath":".id","rootBuilder":["organization","projectOrganization_participation_isParticipant","datasourceOrganization_provision_isProvidedBy"],"includeChildren":"true"},"pace":{"clustering":[{"name":"sortedngrampairs","fields":["legalname"],"params":{"max":2,"ngramLen":"3"}},{"name":"suffixprefix","fields":["legalname"],"params":{"max":1,"len":"3"}},{"name":"urlclustering","fields":["websiteurl"],"params":{}},{"name":"keywordsclustering","fields":["legalname"],"params":{"max":2,"windowSize":4}}],"strictConditions":[{"name":"exactMatch","fields":["gridid"]}],"conditions":[{"name":"DomainExactMatch","fields":["websiteurl"]},{"name":"exactMatch","fields":["country"]}],"model":[{"name":"country","algo":"Null","type":"String","weight":"0","ignoreMissing":"false","path":".country.classid"},{"name":"legalshortname","algo":"JaroWinklerNormalizedName","type":"String","weight":"0.1","ignoreMissing":"true","path":".legalshortname.dedupId"},{"name":"legalname","algo":"JaroWinklerNormalizedName","type":"String","weight":"0.9","ignoreMissing":"false","path":".legalname.dedupId","params":{"windowSize":4,"threshold":0.7}},{"name":"websiteurl","algo":"Null","type":"URL","weight":"0","ignoreMissing":"true","path":".websiteurl.dedupId","params":{"host":0.5,"path":0.5}},{"name":"gridid","algo":"Null","type":"String","weight":"0.0","ignoreMissing":"true","path":".pid[] | select(.qualifier.classid==\"grid\") | .dedupId"}],"blacklists":{"legalname":[]},"synonyms":{"key::1":["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],"key::2":["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],"key::3":["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],"key::4":["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],"key::5":["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],"key::6":["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],"key::7":["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая 
школа","universiteit","κολλέγιο"],"key::8":["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],"key::9":["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],"key::10":["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],"key::11":["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],"key::12":["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],"key::13":["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],"key::14":["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],"key::15":["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],"key::16":["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],"key::17":["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],"key::18":["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],"key::19":["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],"key::20":["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],"key::21":["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],"key::22":["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],"key::23":["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],"key::24":["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],"key::25":["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],"key::26":["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],"key::27":["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],"key::28":["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],"key::29":["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],"key::30":["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],"key::31":["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
"key::32":["industry","industria","industrie","индустрия","industrie","βιομηχανία"],"key::33":["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],"key::34":["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],"key::35":["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],"key::36":["authority","autorità","autorité","авторитет","autoriteit"],"key::37":["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],"key::38":["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],"key::39":["bureau","ufficio","bureau","офис","bureau","γραφείο"],"key::40":["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],"key::41":["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],"key::42":["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],"key::43":["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],"key::44":["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],"key::45":["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],"key::46":["division","divisione","division","отделение","divisie","τμήμα"],"key::47":["committee","comitato","comité","комитет","commissie","επιτροπή"],"key::48":["promotion","promozione","продвижение","proothisis","forderung"],"key::49":["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],"key::50":["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],"key::51":["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],"key::52":["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],"key::53":["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],"key::54":["energy","energia","energía","energia","
Energie","energie","ενέργεια","enerji","energia","energija","energia",""],"key::55":["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],"key::56":["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],"key::57":["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],"key::58":["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],"key::59":["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],"key::60":["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],"key::61":["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],"key::62":["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],"key::63":["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],"key::64":["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],"key::65":["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],"key::66":["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],"key::67":["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],"key::68":["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],"key::69":["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],"key::70":["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνι
ατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],"key::71":["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],"key::72":["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],"key::73":["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],"key::74":["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],"key::75":["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],"key::76":["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],"key::77":["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],"key::78":["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],"key::79":["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],"key::80":["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],"key::81":["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],"key::82":["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],"key::83":["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],"key::84":["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],"key::85":["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],"key::86":["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],"key::87":["dermatology","dermatologia","dermatología","dermatologia","
Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],"key::88":["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],"key::89":["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],"key::90":["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],"key::91":["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],"key::92":["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],"key::93":["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],"key::94":["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],"key::95":["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],"key::96":["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],"key::97":["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],"key::98":["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],"key::99":["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],"key::100":["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],"key::101":["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],"key::102":["informatics","informatica","informática","informática","informatica",""],"key::103":["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],"key::104":["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]}}} - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml index 7f9de8af5..09dd3a315 100644 --- 
a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml @@ -93,6 +93,31 @@ --entity${entity} --dedupConf${dedupConf} + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Connected Components + eu.dnetlib.dedup.SparkCreateDedupRecord + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --sourcePath${sourcePath} + --dedupPath${dedupPath} + --entity${entity} + --dedupConf${dedupConf} + diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java new file mode 100644 index 000000000..817f2075c --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java @@ -0,0 +1,61 @@ +package eu.dnetlib.dedup; + +import eu.dnetlib.dhp.schema.oaf.Publication; +import org.apache.commons.io.IOUtils; +import org.codehaus.jackson.map.ObjectMapper; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +public class MergeAuthorTest { + + List publicationsToMerge; + final ObjectMapper mapper = new ObjectMapper(); + + @Before + public void setUp() throws Exception { + final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json")); + + + publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> { + try { + return mapper.readValue(s, Publication.class); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + + + + } + + + @Test + public void test() throws Exception { + Publication dedup = new Publication(); + + + publicationsToMerge.forEach(p-> { + dedup.mergeFrom(p); + dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor())); + }); + + + + + + + + + System.out.println(mapper.writeValueAsString(dedup)); + + + } + + + +} diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index f35baa9f8..5d5576dd8 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.dedup; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Publication; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; @@ -27,9 +28,9 @@ public class SparkCreateDedupTest { public void createSimRelsTest() throws Exception { SparkCreateSimRels.main(new String[] { "-mt", "local[*]", - "-s", "/Users/miconis/dumps", + "-s", "/home/sandro/betadump", "-e", "publication", - "-c", configuration, + "-c", ArgumentApplicationParser.compressArgument(configuration), "-t", "/tmp/dedup", }); } @@ -40,9 +41,9 @@ public class SparkCreateDedupTest { SparkCreateConnectedComponent.main(new String[] { "-mt", 
"local[*]", - "-s", "/Users/miconis/dumps", + "-s", "/home/sandro/betadump", "-e", "publication", - "-c", configuration, + "-c", ArgumentApplicationParser.compressArgument(configuration), "-t", "/tmp/dedup", }); } @@ -52,10 +53,18 @@ public class SparkCreateDedupTest { public void dedupRecordTest() throws Exception { SparkCreateDedupRecord.main(new String[] { "-mt", "local[*]", - "-s", "/Users/miconis/dumps", + "-s", "/home/sandro/betadump", "-e", "publication", - "-c", configuration, - "-t", "/tmp/dedup", + "-c", ArgumentApplicationParser.compressArgument(configuration), + "-d", "/tmp/dedup", }); } + + @Test + public void printCC() throws Exception { + System.out.println(ArgumentApplicationParser.compressArgument(configuration)); + } + + + } diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json new file mode 100644 index 000000000..4e8b66d1b --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json @@ -0,0 +1,3 @@ +{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Gayubo, Severiano F.","name":"","surname":"","rank":2,"pid":[{"value":"ORCID1","qualifier":{"classid":"orcid","classname":"orcid","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Ciccio 
Pasticcio","name":"","surname":"","rank":2,"pid":[],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. 
Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} +{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[{"value":"ORCIDDIO","qualifier":{"classid":"ORCID","classname":"ORCID","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Gayubo, Severiano 
F.","name":"","surname":"","rank":2,"pid":[{"value":"MAGGLES","qualifier":{"classid":"mag","classname":"mag","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} 
+{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} diff --git a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml index 292ec14c0..905fb9984 100644 --- a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml @@ -1,18 +1,18 @@ jobTracker - yarnRM + yarnRM nameNode - hdfs://nameservice1 + hdfs://nameservice1 sourceNN - 
webhdfs://namenode2.hadoop.dm.openaire.eu:50071 + webhdfs://namenode2.hadoop.dm.openaire.eu:50071 oozie.use.system.libpath - true + true \ No newline at end of file diff --git a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml index 5fe802118..91b97332b 100644 --- a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml @@ -14,12 +14,12 @@ hbase_dump_distcp_memory_mb - 6144 + 6144 memory for distcp action copying InfoSpace dump from remote cluster hbase_dump_distcp_num_maps - 1 + 1 maximum number of simultaneous copies of InfoSpace dump from remote location diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml index abac9bba2..fcab9dd00 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml @@ -1,26 +1,26 @@ jobTracker - yarnRM + yarnRM nameNode - hdfs://nameservice1 + hdfs://nameservice1 oozie.use.system.libpath - true + true oozie.action.sharelib.for.spark - spark2 + spark2 hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 hive_db_name - openaire + openaire \ No newline at end of file diff --git a/dhp-workflows/docs/oozie-installer.markdown b/dhp-workflows/docs/oozie-installer.markdown index 90360ec35..b1953a54e 100644 --- a/dhp-workflows/docs/oozie-installer.markdown +++ b/dhp-workflows/docs/oozie-installer.markdown @@ -54,7 +54,7 @@ Properties overriding order is the following: 2. `~/.dhp/application.properties` defined properties 3. `${workflow.source.dir}/job.properties` 4. `job-override.properties` (located in the project root dir) -5. `maven -Dparam=dedupId` +5. `maven -Dparam=value` where the maven `-Dparam` property is overriding all the other ones. @@ -73,7 +73,7 @@ Workflow definition requirements This property can be set using maven `-D` switch. -`[oozie_app]` is the default directory name however it can be set to any dedupId as soon as `oozieAppDir` property is provided with directory name as dedupId. +`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value. Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory. diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 65227a782..bd2ca9704 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -73,7 +73,7 @@ attach-test-resources - + provided @@ -326,7 +326,7 @@ - + diff --git a/pom.xml b/pom.xml index 5d3dce34a..cc39e46f2 100644 --- a/pom.xml +++ b/pom.xml @@ -236,6 +236,11 @@ java-jq 0.10.1 + + edu.cmu + secondstring + 1.0.0 + org.apache.oozie
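
Editor's note on the test changes above: the updated `SparkCreateDedupTest` methods no longer pass the dedup configuration as a plain `-c` value but wrap it with `ArgumentApplicationParser.compressArgument(configuration)`, and the new dedup-record step receives its output directory via `-d` instead of `-t`. The sketch below shows the same invocation pattern outside JUnit; it is illustrative only — the input/output paths and the classpath location of the JSON configuration are assumptions, not part of this patch.

```java
package eu.dnetlib.dedup;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;

public class DedupRecordLocalRunner {

    public static void main(String[] args) throws Exception {
        // Hypothetical: load the dedup pace configuration from a JSON resource on the classpath;
        // the resource name below is a placeholder, not a file introduced by this patch.
        final String configuration = IOUtils.toString(
                DedupRecordLocalRunner.class.getResourceAsStream("/eu/dnetlib/dedup/conf/pub.conf.json"),
                "UTF-8");

        // Mirrors the flags used by dedupRecordTest: Spark master, source path, entity type,
        // compressed dedup configuration, and the dedup-record output path (-d).
        SparkCreateDedupRecord.main(new String[] {
                "-mt", "local[*]",
                "-s", "/tmp/input/graph",          // hypothetical source path
                "-e", "publication",
                "-c", ArgumentApplicationParser.compressArgument(configuration),
                "-d", "/tmp/output/dedup_records"  // hypothetical output path
        });
    }
}
```

Running this requires a local Spark context (hence `-mt local[*]`); on the cluster the same arguments are supplied by the new `SparkCreateDedupRecord` action added to `dhp-dedup/.../oozie_app/workflow.xml` above.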