
implementation of the sorting by trust mechanism and the merge of oaf entities

This commit is contained in:
miconis 2019-12-10 14:57:16 +01:00
parent cc63706347
commit 4b66b471a4
31 changed files with 769 additions and 1877 deletions

View File

@ -23,4 +23,23 @@ public class Context implements Serializable {
public void setDataInfo(List<DataInfo> dataInfo) {
this.dataInfo = dataInfo;
}
@Override
public int hashCode() {
return id.hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Context other = (Context) obj;
return id.equals(other.getId());
}
}

View File

@ -23,4 +23,25 @@ public class Field<T> implements Serializable {
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
@Override
public int hashCode(){
return getValue().hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Field<T> other = (Field<T>) obj;
return getValue().equals(other.getValue());
}
}

View File

@ -33,4 +33,27 @@ public class KeyValue implements Serializable {
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
public String toComparableString() {
return String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
}
@Override
public int hashCode() {
return toComparableString().hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
KeyValue other = (KeyValue) obj;
return toComparableString().equals(other.toComparableString());
}
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.List; import java.util.*;
import java.util.stream.Collectors;
public abstract class OafEntity extends Oaf implements Serializable {
@ -84,4 +85,32 @@ public abstract class OafEntity extends Oaf implements Serializable {
public void setOaiprovenance(OAIProvenance oaiprovenance) {
this.oaiprovenance = oaiprovenance;
}
public void mergeFrom(OafEntity e) {
if (e == null)
return;
originalId = mergeLists(originalId, e.getOriginalId());
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
pid = mergeLists(pid, e.getPid());
dateofcollection = e.getDateofcollection();
dateoftransformation = e.getDateoftransformation();
extraInfo = mergeLists(extraInfo, e.getExtraInfo());
oaiprovenance = e.getOaiprovenance();
}
protected <T> List<T> mergeLists(final List<T>... lists) {
return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream).distinct().collect(Collectors.toList());
}
}
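
The equals/hashCode overrides added above are what make `mergeLists` deduplicate merged fields. A minimal sketch of that interaction (not part of the commit; it assumes same-package access to the protected `mergeLists`):

```java
package eu.dnetlib.dhp.schema.oaf;

import java.util.Arrays;
import java.util.List;

public class MergeListsSketch {
    public static void main(String[] args) {
        // Two KeyValues that differ only by case: toComparableString() lowercases
        // both key and value, so equals()/hashCode() treat them as the same entry.
        KeyValue a = new KeyValue();
        a.setKey("10|openaire____::datasourceA");
        a.setValue("Datasource A");

        KeyValue b = new KeyValue();
        b.setKey("10|OPENAIRE____::datasourceA");
        b.setValue("datasource a");

        Publication p = new Publication();
        // mergeLists concatenates the input lists, drops null lists, and collapses
        // duplicates via distinct(), which relies on the overrides above.
        List<KeyValue> merged = p.mergeLists(Arrays.asList(a), Arrays.asList(b), null);
        System.out.println(merged.size()); // 1
    }
}
```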

View File

@ -14,4 +14,16 @@ public class Publication extends Result implements Serializable {
public void setJournal(Journal journal) {
this.journal = journal;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
Publication p = (Publication) e;
if (p.getJournal() != null)
journal = p.getJournal();
}
}

View File

@ -40,4 +40,32 @@ public class Qualifier implements Serializable {
public void setSchemename(String schemename) {
this.schemename = schemename;
}
public String toComparableString() {
return String.format("%s::%s::%s::%s",
classid != null ? classid : "",
classname != null ? classname : "",
schemeid != null ? schemeid : "",
schemename != null ? schemename : "");
}
@Override
public int hashCode() {
return toComparableString().hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Qualifier other = (Qualifier) obj;
return toComparableString()
.equals(other.toComparableString());
}
}

View File

@ -1,7 +1,11 @@
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.Objects;
public abstract class Result extends OafEntity implements Serializable {
@ -240,4 +244,145 @@ public abstract class Result extends OafEntity implements Serializable {
this.processingchargecurrency = processingchargecurrency;
return this;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
Result r = (Result) e;
mergeAuthors(r.getAuthor());
//TODO mergeFrom is used only to create Dedup Records, since the creation of these two fields requires more complex functions (maybe they will be filled in by an external function)
// if (author == null)
// author = r.getAuthor(); //authors will be replaced because they could be too much
// dateofacceptance = r.getDateofacceptance();
// instance = mergeLists(instance, r.getInstance());
if (r.getResulttype() != null)
resulttype = r.getResulttype();
if (r.getLanguage() != null)
language = r.getLanguage();
country = mergeLists(country, r.getCountry());
subject = mergeLists(subject, r.getSubject());
title = mergeLists(title, r.getTitle());
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
description = mergeLists(description, r.getDescription());
if (r.getPublisher() != null)
publisher = r.getPublisher();
if (r.getEmbargoenddate() != null)
embargoenddate = r.getEmbargoenddate();
source = mergeLists(source, r.getSource());
fulltext = mergeLists(fulltext, r.getFulltext());
format = mergeLists(format, r.getFormat());
contributor = mergeLists(contributor, r.getContributor());
if (r.getResourcetype() != null)
resourcetype = r.getResourcetype();
coverage = mergeLists(coverage, r.getCoverage());
if (r.getRefereed() != null)
refereed = r.getRefereed();
context = mergeLists(context, r.getContext());
if (r.getProcessingchargeamount() != null)
processingchargeamount = r.getProcessingchargeamount();
if (r.getProcessingchargecurrency() != null)
processingchargecurrency = r.getProcessingchargecurrency();
externalReference = mergeLists(externalReference, r.getExternalReference());
}
public void mergeAuthors(List<Author> authors){
int c1 = countAuthorsPids(author);
int c2 = countAuthorsPids(authors);
int s1 = authorsSize(author);
int s2 = authorsSize(authors);
//if both have no authors with pids and authors is bigger than author
if (c1 == 0 && c2 == 0 && author.size()<authors.size()) {
author = authors;
return;
}
//author is null and authors have 0 or more authors with pids
if (c1<c2 && c1<0) {
author = authors;
return;
}
//let's go eat
// if (author == null && authors == null)
// return;
//
// int c1 = countAuthorsPids(author);
// int c2 = countAuthorsPids(authors);
//
// if (c1<c2 && c1<1){
// author = authors;
// return;
// }
//
// if (c1<c2)
}
public int countAuthorsPids(List<Author> authors){
if (authors == null)
return -1;
return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
}
public int authorsSize(List<Author> authors){
if (authors == null)
return 0;
return authors.size();
}
public String extractAuthorPid(Author a){
if(a == null || a.getPid() == null || a.getPid().size() == 0)
return null;
StringBuilder mainPid = new StringBuilder();
a.getPid().forEach(pid ->{
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
mainPid.setLength(0);
mainPid.append(pid.getValue());
}
else {
if(mainPid.length() == 0)
mainPid.append(pid.getValue());
}
});
return mainPid.toString();
}
}
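
A brief sketch (not part of the commit) of the author-pid extraction that drives `mergeAuthors`: an ORCID pid, matched case-insensitively on the qualifier classid, takes precedence over any other pid on the same author. `Author.setPid` is assumed here to be the usual bean setter for the `List<StructuredProperty>` read by `getPid()`.

```java
package eu.dnetlib.dhp.schema.oaf;

import java.util.Arrays;

public class AuthorPidSketch {
    public static void main(String[] args) {
        StructuredProperty doi = sp("doi", "10.1000/xyz");
        StructuredProperty orcid = sp("ORCID", "0000-0002-1825-0097"); // classid matched case-insensitively

        Author author = new Author();
        author.setPid(Arrays.asList(doi, orcid)); // setPid assumed bean setter

        Publication result = new Publication();
        // prints "0000-0002-1825-0097": the ORCID value overwrites the first pid found
        System.out.println(result.extractAuthorPid(author));
    }

    private static StructuredProperty sp(String classid, String value) {
        Qualifier q = new Qualifier();
        q.setClassid(classid);
        StructuredProperty s = new StructuredProperty();
        s.setQualifier(q);
        s.setValue(value);
        return s;
    }
}
```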

View File

@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable {
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
public String toComparableString(){
return String.format("%s::%s", value != null ? value.toLowerCase() : "", qualifier != null ? qualifier.toComparableString().toLowerCase() : "");
}
@Override
public int hashCode() {
return toComparableString().hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
StructuredProperty other = (StructuredProperty) obj;
return toComparableString()
.equals(other.toComparableString());
}
}

View File

@ -0,0 +1,89 @@
package eu.dnetlib.dhp.schema.oaf;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class MergeTest {
OafEntity oaf;
@Before
public void setUp() {
oaf = new Publication();
}
@Test
public void mergeListsTest() {
//string list merge test
List<String> a = Arrays.asList("a", "b", "c", "e");
List<String> b = Arrays.asList("a", "b", "c", "d");
List<String> c = null;
System.out.println("merge result 1 = " + oaf.mergeLists(a, b));
System.out.println("merge result 2 = " + oaf.mergeLists(a, c));
System.out.println("merge result 3 = " + oaf.mergeLists(c, c));
}
@Test
public void mergePublicationCollectedFromTest() {
Publication a = new Publication();
Publication b = new Publication();
a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed")));
b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open")));
a.mergeFrom(b);
Assert.assertNotNull(a.getCollectedfrom());
Assert.assertEquals(3, a.getCollectedfrom().size());
}
@Test
public void mergePublicationSubjectTest() {
Publication a = new Publication();
Publication b = new Publication();
a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe")));
b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe")));
a.mergeFrom(b);
Assert.assertNotNull(a.getSubject());
Assert.assertEquals(3, a.getSubject().size());
}
private KeyValue setKV(final String key, final String value) {
KeyValue k = new KeyValue();
k.setKey(key);
k.setValue(value);
return k;
}
private StructuredProperty setSP(final String value, final String schema, final String classname) {
StructuredProperty s = new StructuredProperty();
s.setValue(value);
Qualifier q = new Qualifier();
q.setClassname(classname);
q.setClassid(classname);
q.setSchemename(schema);
q.setSchemeid(schema);
s.setQualifier(q);
return s;
}
}

View File

@ -9,7 +9,7 @@
<xsl:copy-of select="//oai:header"/>
<metadata>
<xsl:for-each select="//*[local-name()='subject']">
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject> <subject><xsl:dedupId-of select="eg:clean(.,'dnet:languages')"/></subject>
</xsl:for-each>
</metadata>
<oaf:about>

View File

@ -1,11 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/> <RESOURCE_IDENTIFIER dedupId="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/> <RESOURCE_TYPE dedupId="TransformationRuleDSResourceType"/>
<RESOURCE_KIND value="TransformationRuleDSResources"/> <RESOURCE_KIND dedupId="TransformationRuleDSResources"/>
<RESOURCE_URI value=""/> <RESOURCE_URI dedupId=""/>
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/> <DATE_OF_CREATION dedupId="2019-04-11T11:15:30+00:00"/>
</HEADER>
<BODY>
<CONFIGURATION>
@ -24,7 +24,7 @@
<xsl:copy-of select="//oai:header"/>
<metadata>
<xsl:for-each select="//*[local-name()='subject']">
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject> <subject><xsl:dedupId-of select="eg:clean(.,'dnet:languages')"/></subject>
</xsl:for-each>
</metadata>
<oaf:about>

View File

@ -0,0 +1,169 @@
package eu.dnetlib.dedup;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import static java.util.stream.Collectors.toMap;
public class DedupRecordFactory {
public JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){
//<id, json_entity>
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
.mapToPair((PairFunction<String,String,String>) it->
new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
);
//<source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String,String> mergeRels = spark
.read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.mapToPair(
(PairFunction<Relation, String,String>)r->
new Tuple2<String, String>(r.getTarget(), r.getSource())
);
//<dedup_id, json_entity_merged>
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
JavaPairRDD<OafKey, String> keyJson = joinResult.mapToPair((PairFunction<Tuple2<String, String>, OafKey, String>) json -> {
String idValue = json._1();
String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
//TODO remember to replace this with the actual trust retrieval
if (StringUtils.isBlank(trust)) {
Random generator = new Random();
int number = generator.nextInt(20);
double result = (number / 100.0) + 0.80;
trust = "" + result;
}
return new Tuple2<OafKey, String>(new OafKey(idValue, trust), json._2());
});
OafComparator c = new OafComparator();
//<dedup_id, mergedRecordsSortedByTrust>
JavaPairRDD<String, Iterable<String>> sortedJoinResult = keyJson.repartitionAndSortWithinPartitions(new OafPartitioner(keyJson.getNumPartitions()), c)
.mapToPair((PairFunction<Tuple2<OafKey, String>, String, String>) t -> new Tuple2<String, String>(t._1().getDedupId(), t._2()))
.groupByKey();
switch(entityType){
case Publication:
return sortedJoinResult.map(this::publicationMerger);
case Dataset:
return sortedJoinResult.map(this::datasetMerger);
case Project:
return sortedJoinResult.map(this::projectMerger);
case Software:
return sortedJoinResult.map(this::softwareMerger);
case Datasource:
return sortedJoinResult.map(this::datasourceMerger);
case Organization:
return sortedJoinResult.map(this::organizationMerger);
case OtherResearchProduct:
return sortedJoinResult.map(this::otherresearchproductMerger);
default:
return null;
}
}
private Publication publicationMerger(Tuple2<String, Iterable<String>> e){
Publication p = new Publication(); //the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
final Collection<String> dateofacceptance = Lists.newArrayList();
final Collection<List<Author>> authors = Lists.newArrayList();
final Collection<List<Instance>> instances = Lists.newArrayList();
StringBuilder trust = new StringBuilder("0.0");
e._2().forEach(pub -> {
try {
Publication publication = mapper.readValue(pub, Publication.class);
final String currentTrust = publication.getDataInfo().getTrust();
if (!currentTrust.equals("1.0")) {
trust.setLength(0);
trust.append(currentTrust);
}
p.mergeFrom(publication);
//add to the list if they are not null
if (publication.getDateofacceptance() != null)
dateofacceptance.add(publication.getDateofacceptance().getValue());
if (publication.getAuthor() != null)
authors.add(publication.getAuthor());
if (publication.getInstance() != null)
instances.add(publication.getInstance());
} catch (Exception exc){}
});
p.setAuthor(null); //TODO create a single list of authors to put in the final publication
return p;
}
private Dataset datasetMerger(Tuple2<String, Iterable<String>> e){
throw new NotImplementedException();
}
private Project projectMerger(Tuple2<String, Iterable<String>> e){
throw new NotImplementedException();
}
private Software softwareMerger(Tuple2<String, Iterable<String>> e){
throw new NotImplementedException();
}
private Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){
throw new NotImplementedException();
}
private Organization organizationMerger(Tuple2<String, Iterable<String>> e){
throw new NotImplementedException();
}
private OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){
throw new NotImplementedException();
}
}
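
A hypothetical driver (not part of the commit) showing how the factory is meant to be wired; the app name, input paths, and the configuration resource are placeholders, with the configuration read the same way the tests below do:

```java
package eu.dnetlib.dedup;

import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class DedupRecordDriverSketch {
    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
                .appName("CreateDedupRecord")
                .master("local[*]")
                .getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // configuration JSON read from the classpath, as in SparkCreateDedupTest
        String confJson = IOUtils.toString(
                DedupRecordDriverSketch.class.getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json"));
        DedupConfig dedupConf = DedupConfig.load(confJson);

        JavaRDD<OafEntity> dedupRecords = new DedupRecordFactory().createDedupRecord(
                sc, spark,
                "/tmp/dedup/publication_mergeRels",   // mergeRelsInputPath (placeholder)
                "/tmp/dump/publication",              // entitiesInputPath (placeholder)
                OafEntityType.Publication,
                dedupConf);

        System.out.println("dedup records: " + dedupRecords.count());
    }
}
```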

View File

@ -0,0 +1,15 @@
package eu.dnetlib.dedup;
import com.google.common.collect.ComparisonChain;
import java.io.Serializable;
import java.util.Comparator;
public class OafComparator implements Comparator<OafKey>, Serializable {
@Override
public int compare(OafKey a, OafKey b) {
return ComparisonChain.start()
.compare(a.getDedupId(), b.getDedupId())
.compare(a.getTrust(), b.getTrust())
.result();
}
}
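
A small sketch (not part of the commit) of what the comparator guarantees: keys are ordered by dedupId first and by trust second; since both fields are Strings, trust values are compared lexicographically.

```java
package eu.dnetlib.dedup;

public class OafComparatorSketch {
    public static void main(String[] args) {
        OafComparator c = new OafComparator();

        // same dedup id: ordering falls through to the trust field
        OafKey lower  = new OafKey("50|dedup_wf_001::abc", "0.81");
        OafKey higher = new OafKey("50|dedup_wf_001::abc", "0.95");
        System.out.println(c.compare(lower, higher) < 0); // true: "0.81" < "0.95"

        // different dedup ids: trust is never consulted
        OafKey other = new OafKey("50|dedup_wf_001::abd", "0.10");
        System.out.println(c.compare(lower, other) < 0);  // true: ids compared first
    }
}
```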

View File

@ -0,0 +1,13 @@
package eu.dnetlib.dedup;
public enum OafEntityType {
Datasource,
Organization,
Project,
Dataset,
OtherResearchProduct,
Software,
Publication
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dedup;
import java.io.Serializable;
public class OafKey implements Serializable {
private String dedupId;
private String trust;
public OafKey(String dedupId, String trust) {
this.dedupId = dedupId;
this.trust = trust;
}
public OafKey() {
}
public String getDedupId() {
return dedupId;
}
public void setDedupId(String dedupId) {
this.dedupId = dedupId;
}
public String getTrust() {
return trust;
}
public void setTrust(String trust) {
this.trust = trust;
}
@Override
public String toString(){
return String.format("%s->%d", dedupId,trust);
}
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.dedup;
import org.apache.spark.Partitioner;
import java.io.Serializable;
public class OafPartitioner extends Partitioner implements Serializable {
private final int numPartitions;
public OafPartitioner(int partitions) {
assert (partitions > 0);
this.numPartitions = partitions;
}
@Override
public int numPartitions() {
return numPartitions;
}
@Override
public int getPartition(Object key) {
if (key instanceof OafKey) {
@SuppressWarnings("unchecked")
OafKey item = (OafKey) key;
return Math.abs(item.getDedupId().hashCode() % numPartitions);
} else {
throw new IllegalArgumentException("Unexpected Key");
}
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + numPartitions;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (!(obj instanceof OafPartitioner)) {
return false;
}
//
OafPartitioner other = (OafPartitioner) obj;
if (numPartitions != other.numPartitions) {
return false;
}
//
return true;
}
}
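
A brief sketch (not part of the commit) of the property the partitioner provides: keys are routed by dedupId only, so every record of a duplicate group lands in the same partition and can then be sorted by trust within it.

```java
package eu.dnetlib.dedup;

public class OafPartitionerSketch {
    public static void main(String[] args) {
        OafPartitioner partitioner = new OafPartitioner(8);

        // same dedupId, different trust: both keys map to the same partition,
        // because getPartition() hashes only the dedupId
        OafKey k1 = new OafKey("50|dedup_wf_001::abc", "0.81");
        OafKey k2 = new OafKey("50|dedup_wf_001::abc", "0.95");
        System.out.println(partitioner.getPartition(k1) == partitioner.getPartition(k2)); // true
    }
}
```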

View File

@ -37,8 +37,8 @@ public class SparkCreateConnectedComponent {
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); // final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dedup;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ -17,6 +18,7 @@ import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
@ -27,26 +29,28 @@ import java.util.List;
public class SparkCreateDedupRecord {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); // final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
parser.parseArgument(args); // parser.parseArgument(args);
final SparkSession spark = SparkSession // final SparkSession spark = SparkSession
.builder() // .builder()
.appName(SparkCreateDedupRecord.class.getSimpleName()) // .appName(SparkCreateDedupRecord.class.getSimpleName())
.master(parser.get("master")) // .master(parser.get("master"))
.getOrCreate(); // .getOrCreate();
//
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); // final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath"); // final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity"); // final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath"); // final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); //// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
// final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity) //
.mapToPair((PairFunction<String,String,String>)it-> // //<id, json_entity>
new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it) // final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
); // .mapToPair((PairFunction<String,String,String>)it->
// new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
// );
// //<source, target>: source is the dedup_id, target is the id of the mergedIn
// JavaPairRDD<String,String> mergeRels = spark
// .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
// .where("relClass=='merges'")
@ -56,46 +60,12 @@ public class SparkCreateDedupRecord {
// new Tuple2<String,String>(r.getTarget(), r.getSource())
// );
//
// // //<dedup_id, json_entity_merged>
// final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
//
// Comparator<String> c = new Comparator<String>() {
// @Override
// public int compare(String s, String t1) {
// return 0;
// }
// };
// final JavaPairRDD<String, String> stringStringJavaPairRDD = p.repartitionAndSortWithinPartitions(p.partitioner().get(), c);
StructType schema = Encoders.bean(Publication.class).schema();
// List<Foo> inputValues = Arrays.asList( System.out.println(schema);
// new Foo("k",5),
// new Foo("a",1),
// new Foo("a",30),
// new Foo("a",18),
// new Foo("a",22),
// new Foo("b",22),
// new Foo("c",5),
// new Foo("a",5),
// new Foo("s",1),
// new Foo("h",4)
// );
//
//
// final JavaPairRDD<Foo, Foo> fooFighters = sc.parallelize(inputValues).mapToPair((PairFunction<Foo, Foo, Foo>) i -> new Tuple2<Foo, Foo>(i, i));
//
//
// FooComparator c = new FooComparator();
// final List<Tuple2<String, List<Foo>>> result =
// fooFighters.repartitionAndSortWithinPartitions(new FooPartitioner(fooFighters.getNumPartitions()), c)
// .mapToPair((PairFunction<Tuple2<Foo, Foo>, String, Foo>) t-> new Tuple2<String,Foo>(t._1().getValue(), t._2()) )
// .groupByKey()
// .mapValues((Function<Iterable<Foo>, List<Foo>>) Lists::newArrayList)
// .collect();
//
//
// System.out.println(result);
}
}

View File

@ -44,7 +44,8 @@ public class SparkCreateSimRels {
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); // final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final long total = sc.textFile(inputPath + "/" + entity).count();

View File

@ -1,7 +1,5 @@
package eu.dnetlib.dedup.graph
import eu.dnetlib.pace.model.MapDocument
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
@ -25,7 +23,7 @@ object GraphProcessor {
}
val connectedComponents = joinResult.groupByKey()
.map[ConnectedComponent](cc => asConnectedComponent(cc))
(connectedComponents) connectedComponents
}

View File

@ -1,5 +1,7 @@
package eu.dnetlib.dedup;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
@ -8,36 +10,37 @@ import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.List;
public class SparkCreateDedupTest {
String configuration;
@Before
public void setUp() throws IOException {
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_vertex")); configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json"));
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_rels"));
}
@Test
@Ignore
public void dedupTest() throws Exception { public void createSimRelsTest() throws Exception {
final String configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
SparkCreateSimRels.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump", "-s", "/Users/miconis/dumps",
"-e", "publication",
"-c", configuration,
"-t", "/tmp/dedup",
});
}
@Test
@Ignore
public void createCCTest() throws Exception {
SparkCreateConnectedComponent.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump", "-s", "/Users/miconis/dumps",
"-e", "publication",
"-c", configuration,
"-t", "/tmp/dedup",
@ -49,14 +52,10 @@ public class SparkCreateDedupTest {
public void dedupRecordTest() throws Exception {
SparkCreateDedupRecord.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump", "-s", "/Users/miconis/dumps",
"-e", "publication",
"-c", "configuration", "-c", configuration,
"-t", "/tmp/dedup",
});
}
}

View File

@ -7,7 +7,7 @@
"queueMaxSize": "2000", "queueMaxSize": "2000",
"groupMaxSize": "50", "groupMaxSize": "50",
"slidingWindowSize": "200", "slidingWindowSize": "200",
"idPath": "$.id", "idPath": ".id",
"rootBuilder": [ "rootBuilder": [
"organization", "organization",
"projectOrganization_participation_isParticipant", "projectOrganization_participation_isParticipant",
@ -117,14 +117,6 @@
"host": 0.5, "host": 0.5,
"path": 0.5 "path": 0.5
} }
},
{
"name": "gridid",
"algo": "Null",
"type": "String",
"weight": "0.0",
"ignoreMissing": "true",
"path": ".pid[] | select(.qualifier.classid==\"grid\") | .value"
}
],
"blacklists": {

View File

@ -1,18 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value> <dedupId>yarnRM</dedupId>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value> <dedupId>hdfs://nameservice1</dedupId>
</property>
<property>
<name>sourceNN</name>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value> <dedupId>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</dedupId>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value> <dedupId>true</dedupId>
</property>
</configuration>

View File

@ -14,12 +14,12 @@
</property>
<property>
<name>hbase_dump_distcp_memory_mb</name>
<value>6144</value> <dedupId>6144</dedupId>
<description>memory for distcp action copying InfoSpace dump from remote cluster</description>
</property>
<property>
<name>hbase_dump_distcp_num_maps</name>
<value>1</value> <dedupId>1</dedupId>
<description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
</property>
</parameters>

View File

@ -1,26 +1,26 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value> <dedupId>yarnRM</dedupId>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value> <dedupId>hdfs://nameservice1</dedupId>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value> <dedupId>true</dedupId>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <dedupId>spark2</dedupId>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value> <dedupId>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</dedupId>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value> <dedupId>openaire</dedupId>
</property>
</configuration>

View File

@ -54,7 +54,7 @@ Properties overriding order is the following:
2. `~/.dhp/application.properties` defined properties
3. `${workflow.source.dir}/job.properties`
4. `job-override.properties` (located in the project root dir)
5. `maven -Dparam=value` 5. `maven -Dparam=dedupId`
where the maven `-Dparam` property is overriding all the other ones. where the maven `-Dparam` property is overriding all the other ones.
@ -73,7 +73,7 @@ Workflow definition requirements
This property can be set using maven `-D` switch.
`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value. `[oozie_app]` is the default directory name however it can be set to any dedupId as soon as `oozieAppDir` property is provided with directory name as dedupId.
Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory.

View File

@ -73,7 +73,7 @@
<!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
<id>attach-test-resources</id>
<properties>
<!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package requires all test dependencies. --> <!--overriding default scope (set to 'runtime') with the 'test' dedupId. Test resources attached to oozie package requires all test dependencies. -->
<oozie.package.dependencies.include.scope />
<oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
<!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
@ -326,7 +326,7 @@
</goals>
<configuration>
<tasks>
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" /> <property name="assembly-resources.loc" dedupId="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
</tasks>
</configuration>