forked from antonis.lempesis/dnet-hadoop
implementation of the sorting by trust mechanism and the merge of oaf entities
This commit is contained in:
parent
cc63706347
commit
4b66b471a4
|
@ -23,4 +23,23 @@ public class Context implements Serializable {
|
|||
public void setDataInfo(List<DataInfo> dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return id.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
Context other = (Context) obj;
|
||||
|
||||
return id.equals(other.getId());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,4 +23,25 @@ public class Field<T> implements Serializable {
|
|||
public void setDataInfo(DataInfo dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode(){
|
||||
return getValue().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
Field<T> other = (Field<T>) obj;
|
||||
|
||||
return getValue().equals(other.getValue());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -33,4 +33,27 @@ public class KeyValue implements Serializable {
|
|||
public void setDataInfo(DataInfo dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
public String toComparableString() {
|
||||
return String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return toComparableString().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
KeyValue other = (KeyValue) obj;
|
||||
|
||||
return toComparableString().equals(other.toComparableString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public abstract class OafEntity extends Oaf implements Serializable {
|
||||
|
||||
|
@ -84,4 +85,32 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
|||
public void setOaiprovenance(OAIProvenance oaiprovenance) {
|
||||
this.oaiprovenance = oaiprovenance;
|
||||
}
|
||||
|
||||
public void mergeFrom(OafEntity e) {
|
||||
|
||||
if (e == null)
|
||||
return;
|
||||
|
||||
originalId = mergeLists(originalId, e.getOriginalId());
|
||||
|
||||
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
|
||||
|
||||
pid = mergeLists(pid, e.getPid());
|
||||
|
||||
dateofcollection = e.getDateofcollection();
|
||||
|
||||
dateoftransformation = e.getDateoftransformation();
|
||||
|
||||
extraInfo = mergeLists(extraInfo, e.getExtraInfo());
|
||||
|
||||
oaiprovenance = e.getOaiprovenance();
|
||||
|
||||
}
|
||||
|
||||
protected <T> List<T> mergeLists(final List<T>... lists) {
|
||||
|
||||
return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream).distinct().collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -14,4 +14,16 @@ public class Publication extends Result implements Serializable {
|
|||
public void setJournal(Journal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
|
||||
Publication p = (Publication) e;
|
||||
|
||||
if (p.getJournal() != null)
|
||||
journal = p.getJournal();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -40,4 +40,32 @@ public class Qualifier implements Serializable {
|
|||
public void setSchemename(String schemename) {
|
||||
this.schemename = schemename;
|
||||
}
|
||||
|
||||
public String toComparableString() {
|
||||
return String.format("%s::%s::%s::%s",
|
||||
classid != null ? classid : "",
|
||||
classname != null ? classname : "",
|
||||
schemeid != null ? schemeid : "",
|
||||
schemename != null ? schemename : "");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return toComparableString().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
Qualifier other = (Qualifier) obj;
|
||||
|
||||
return toComparableString()
|
||||
.equals(other.toComparableString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
public abstract class Result extends OafEntity implements Serializable {
|
||||
|
||||
|
@ -240,4 +244,145 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.processingchargecurrency = processingchargecurrency;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
|
||||
Result r = (Result) e;
|
||||
|
||||
mergeAuthors(r.getAuthor());
|
||||
|
||||
//TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function)
|
||||
// if (author == null)
|
||||
// author = r.getAuthor(); //authors will be replaced because they could be too much
|
||||
// dateofacceptance = r.getDateofacceptance();
|
||||
// instance = mergeLists(instance, r.getInstance());
|
||||
|
||||
if (r.getResulttype() != null)
|
||||
resulttype = r.getResulttype();
|
||||
|
||||
if (r.getLanguage() != null)
|
||||
language = r.getLanguage();
|
||||
|
||||
country = mergeLists(country, r.getCountry());
|
||||
|
||||
subject = mergeLists(subject, r.getSubject());
|
||||
|
||||
title = mergeLists(title, r.getTitle());
|
||||
|
||||
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
||||
|
||||
description = mergeLists(description, r.getDescription());
|
||||
|
||||
if (r.getPublisher() != null)
|
||||
publisher = r.getPublisher();
|
||||
|
||||
if (r.getEmbargoenddate() != null)
|
||||
embargoenddate = r.getEmbargoenddate();
|
||||
|
||||
source = mergeLists(source, r.getSource());
|
||||
|
||||
fulltext = mergeLists(fulltext, r.getFulltext());
|
||||
|
||||
format = mergeLists(format, r.getFormat());
|
||||
|
||||
contributor = mergeLists(contributor, r.getContributor());
|
||||
|
||||
if (r.getResourcetype() != null)
|
||||
resourcetype = r.getResourcetype();
|
||||
|
||||
coverage = mergeLists(coverage, r.getCoverage());
|
||||
|
||||
if (r.getRefereed() != null)
|
||||
refereed = r.getRefereed();
|
||||
|
||||
context = mergeLists(context, r.getContext());
|
||||
|
||||
if (r.getProcessingchargeamount() != null)
|
||||
processingchargeamount = r.getProcessingchargeamount();
|
||||
|
||||
if (r.getProcessingchargecurrency() != null)
|
||||
processingchargecurrency = r.getProcessingchargecurrency();
|
||||
|
||||
externalReference = mergeLists(externalReference, r.getExternalReference());
|
||||
|
||||
}
|
||||
|
||||
public void mergeAuthors(List<Author> authors){
|
||||
int c1 = countAuthorsPids(author);
|
||||
int c2 = countAuthorsPids(authors);
|
||||
int s1 = authorsSize(author);
|
||||
int s2 = authorsSize(authors);
|
||||
|
||||
|
||||
//if both have no authors with pids and authors is bigger than author
|
||||
if (c1 == 0 && c2 == 0 && author.size()<authors.size()) {
|
||||
author = authors;
|
||||
return;
|
||||
}
|
||||
|
||||
//author is null and authors have 0 or more authors with pids
|
||||
if (c1<c2 && c1<0) {
|
||||
author = authors;
|
||||
return;
|
||||
}
|
||||
|
||||
//andiamo a mangiare
|
||||
|
||||
|
||||
// if (author == null && authors == null)
|
||||
// return;
|
||||
//
|
||||
// int c1 = countAuthorsPids(author);
|
||||
// int c2 = countAuthorsPids(authors);
|
||||
//
|
||||
// if (c1<c2 && c1<1){
|
||||
// author = authors;
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// if (c1<c2)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
public int countAuthorsPids(List<Author> authors){
|
||||
if (authors == null)
|
||||
return -1;
|
||||
|
||||
return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
|
||||
}
|
||||
|
||||
public int authorsSize(List<Author> authors){
|
||||
if (authors == null)
|
||||
return 0;
|
||||
return authors.size();
|
||||
}
|
||||
|
||||
public String extractAuthorPid(Author a){
|
||||
|
||||
if(a == null || a.getPid() == null || a.getPid().size() == 0)
|
||||
return null;
|
||||
|
||||
StringBuilder mainPid = new StringBuilder();
|
||||
|
||||
a.getPid().forEach(pid ->{
|
||||
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
|
||||
mainPid.setLength(0);
|
||||
mainPid.append(pid.getValue());
|
||||
}
|
||||
else {
|
||||
if(mainPid.length() == 0)
|
||||
mainPid.append(pid.getValue());
|
||||
}
|
||||
});
|
||||
|
||||
return mainPid.toString();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable {
|
|||
public void setDataInfo(DataInfo dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
public String toComparableString(){
|
||||
return String.format("%s::%s", value != null ? value.toLowerCase() : "", qualifier != null ? qualifier.toComparableString().toLowerCase() : "");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return toComparableString().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
StructuredProperty other = (StructuredProperty) obj;
|
||||
|
||||
return toComparableString()
|
||||
.equals(other.toComparableString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class MergeTest {
|
||||
|
||||
OafEntity oaf;
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
oaf = new Publication();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergeListsTest() {
|
||||
|
||||
//string list merge test
|
||||
List<String> a = Arrays.asList("a", "b", "c", "e");
|
||||
List<String> b = Arrays.asList("a", "b", "c", "d");
|
||||
List<String> c = null;
|
||||
|
||||
System.out.println("merge result 1 = " + oaf.mergeLists(a, b));
|
||||
|
||||
System.out.println("merge result 2 = " + oaf.mergeLists(a, c));
|
||||
|
||||
System.out.println("merge result 3 = " + oaf.mergeLists(c, c));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergePublicationCollectedFromTest() {
|
||||
|
||||
Publication a = new Publication();
|
||||
Publication b = new Publication();
|
||||
|
||||
a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed")));
|
||||
b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open")));
|
||||
|
||||
a.mergeFrom(b);
|
||||
|
||||
Assert.assertNotNull(a.getCollectedfrom());
|
||||
Assert.assertEquals(3, a.getCollectedfrom().size());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergePublicationSubjectTest() {
|
||||
|
||||
Publication a = new Publication();
|
||||
Publication b = new Publication();
|
||||
|
||||
a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe")));
|
||||
b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe")));
|
||||
|
||||
a.mergeFrom(b);
|
||||
|
||||
Assert.assertNotNull(a.getSubject());
|
||||
Assert.assertEquals(3, a.getSubject().size());
|
||||
|
||||
}
|
||||
|
||||
private KeyValue setKV(final String key, final String value) {
|
||||
|
||||
KeyValue k = new KeyValue();
|
||||
|
||||
k.setKey(key);
|
||||
k.setValue(value);
|
||||
|
||||
return k;
|
||||
}
|
||||
|
||||
private StructuredProperty setSP(final String value, final String schema, final String classname) {
|
||||
StructuredProperty s = new StructuredProperty();
|
||||
s.setValue(value);
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassname(classname);
|
||||
q.setClassid(classname);
|
||||
q.setSchemename(schema);
|
||||
q.setSchemeid(schema);
|
||||
s.setQualifier(q);
|
||||
return s;
|
||||
}
|
||||
}
|
|
@ -9,7 +9,7 @@
|
|||
<xsl:copy-of select="//oai:header"/>
|
||||
<metadata>
|
||||
<xsl:for-each select="//*[local-name()='subject']">
|
||||
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
||||
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
||||
</xsl:for-each>
|
||||
</metadata>
|
||||
<oaf:about>
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<RESOURCE_PROFILE>
|
||||
<HEADER>
|
||||
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
|
||||
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
|
||||
<RESOURCE_KIND value="TransformationRuleDSResources"/>
|
||||
<RESOURCE_URI value=""/>
|
||||
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
|
||||
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
|
||||
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
|
||||
<RESOURCE_KIND value="TransformationRuleDSResources"/>
|
||||
<RESOURCE_URI value=""/>
|
||||
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
|
||||
</HEADER>
|
||||
<BODY>
|
||||
<CONFIGURATION>
|
||||
|
@ -24,7 +24,7 @@
|
|||
<xsl:copy-of select="//oai:header"/>
|
||||
<metadata>
|
||||
<xsl:for-each select="//*[local-name()='subject']">
|
||||
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
||||
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
||||
</xsl:for-each>
|
||||
</metadata>
|
||||
<oaf:about>
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
package eu.dnetlib.dedup;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import static java.util.stream.Collectors.toMap;
|
||||
|
||||
public class DedupRecordFactory {
|
||||
|
||||
public JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){
|
||||
|
||||
//<id, json_entity>
|
||||
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
|
||||
.mapToPair((PairFunction<String,String,String>) it->
|
||||
new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
|
||||
);
|
||||
|
||||
//<source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||
JavaPairRDD<String,String> mergeRels = spark
|
||||
.read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
|
||||
.where("relClass=='merges'")
|
||||
.javaRDD()
|
||||
.mapToPair(
|
||||
(PairFunction<Relation, String,String>)r->
|
||||
new Tuple2<String, String>(r.getTarget(), r.getSource())
|
||||
);
|
||||
|
||||
//<dedup_id, json_entity_merged>
|
||||
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||
|
||||
JavaPairRDD<OafKey, String> keyJson = joinResult.mapToPair((PairFunction<Tuple2<String, String>, OafKey, String>) json -> {
|
||||
|
||||
String idValue = json._1();
|
||||
|
||||
String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
|
||||
|
||||
//TODO remember to replace this with the actual trust retrieving
|
||||
if (StringUtils.isBlank(trust)) {
|
||||
Random generator = new Random();
|
||||
int number = generator.nextInt(20);
|
||||
double result = (number / 100.0) + 0.80;
|
||||
trust = "" + result;
|
||||
}
|
||||
|
||||
return new Tuple2<OafKey, String>(new OafKey(idValue, trust), json._2());
|
||||
});
|
||||
|
||||
OafComparator c = new OafComparator();
|
||||
//<dedup_id, mergedRecordsSortedByTrust>
|
||||
JavaPairRDD<String, Iterable<String>> sortedJoinResult = keyJson.repartitionAndSortWithinPartitions(new OafPartitioner(keyJson.getNumPartitions()), c)
|
||||
.mapToPair((PairFunction<Tuple2<OafKey, String>, String, String>) t -> new Tuple2<String, String>(t._1().getDedupId(), t._2()))
|
||||
.groupByKey();
|
||||
|
||||
|
||||
switch(entityType){
|
||||
case Publication:
|
||||
return sortedJoinResult.map(this::publicationMerger);
|
||||
case Dataset:
|
||||
return sortedJoinResult.map(this::datasetMerger);
|
||||
case Project:
|
||||
return sortedJoinResult.map(this::projectMerger);
|
||||
case Software:
|
||||
return sortedJoinResult.map(this::softwareMerger);
|
||||
case Datasource:
|
||||
return sortedJoinResult.map(this::datasourceMerger);
|
||||
case Organization:
|
||||
return sortedJoinResult.map(this::organizationMerger);
|
||||
case OtherResearchProduct:
|
||||
return sortedJoinResult.map(this::otherresearchproductMerger);
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Publication publicationMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
Publication p = new Publication(); //the result of the merge, to be returned at the end
|
||||
|
||||
p.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
final Collection<List<Author>> authors = Lists.newArrayList();
|
||||
final Collection<List<Instance>> instances = Lists.newArrayList();
|
||||
|
||||
StringBuilder trust = new StringBuilder("0.0");
|
||||
|
||||
e._2().forEach(pub -> {
|
||||
try {
|
||||
Publication publication = mapper.readValue(pub, Publication.class);
|
||||
|
||||
final String currentTrust = publication.getDataInfo().getTrust();
|
||||
if (!currentTrust.equals("1.0")) {
|
||||
trust.setLength(0);
|
||||
trust.append(currentTrust);
|
||||
}
|
||||
|
||||
p.mergeFrom(publication);
|
||||
|
||||
//add to the list if they are not null
|
||||
if (publication.getDateofacceptance() != null)
|
||||
dateofacceptance.add(publication.getDateofacceptance().getValue());
|
||||
if (publication.getAuthor() != null)
|
||||
authors.add(publication.getAuthor());
|
||||
if (publication.getInstance() != null)
|
||||
instances.add(publication.getInstance());
|
||||
|
||||
} catch (Exception exc){}
|
||||
|
||||
});
|
||||
|
||||
p.setAuthor(null); //TODO create a single list of authors to put in the final publication
|
||||
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
private Dataset datasetMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
private Project projectMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
private Software softwareMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
private Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
private Organization organizationMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
private OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){
|
||||
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
package eu.dnetlib.dedup;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import java.io.Serializable;
|
||||
import java.util.Comparator;
|
||||
|
||||
public class OafComparator implements Comparator<OafKey>, Serializable {
|
||||
|
||||
@Override
|
||||
public int compare(OafKey a, OafKey b) {
|
||||
return ComparisonChain.start()
|
||||
.compare(a.getDedupId(), b.getDedupId())
|
||||
.compare(a.getTrust(), b.getTrust())
|
||||
.result();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.dedup;
|
||||
|
||||
/**
 * The entity types for which a dedup record can be requested.
 */
public enum OafEntityType {
    Datasource,
    Organization,
    Project,
    Dataset,
    OtherResearchProduct,
    Software,
    Publication
}
|
|
@ -0,0 +1,31 @@
|
|||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.io.Serializable;
|
||||
/**
 * Composite key made of a dedup id and a trust value, both kept as strings.
 */
public class OafKey implements Serializable {

    private String dedupId;
    private String trust;

    /**
     * @param dedupId the identifier of the dedup record
     * @param trust   the trust of the entity, as a string
     */
    public OafKey(String dedupId, String trust) {
        this.dedupId = dedupId;
        this.trust = trust;
    }

    /** Creates an empty key, to be filled through the setters. */
    public OafKey() {
    }

    public String getDedupId() {
        return dedupId;
    }

    public void setDedupId(String dedupId) {
        this.dedupId = dedupId;
    }

    public String getTrust() {
        return trust;
    }

    public void setTrust(String trust) {
        this.trust = trust;
    }

    /**
     * @return the key rendered as {@code dedupId->trust}
     */
    @Override
    public String toString() {
        // trust is a String: the %d conversion would throw IllegalFormatConversionException
        return String.format("%s->%s", dedupId, trust);
    }
}
|
|
@ -0,0 +1,59 @@
|
|||
package eu.dnetlib.dedup;
|
||||
|
||||
import org.apache.spark.Partitioner;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class OafPartitioner extends Partitioner implements Serializable {
|
||||
|
||||
private final int numPartitions;
|
||||
|
||||
public OafPartitioner(int partitions) {
|
||||
assert (partitions > 0);
|
||||
this.numPartitions = partitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numPartitions() {
|
||||
return numPartitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPartition(Object key) {
|
||||
if (key instanceof OafKey) {
|
||||
@SuppressWarnings("unchecked")
|
||||
OafKey item = (OafKey) key;
|
||||
return Math.abs(item.getDedupId().hashCode() % numPartitions);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unexpected Key");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + numPartitions;
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (!(obj instanceof OafPartitioner)) {
|
||||
return false;
|
||||
}
|
||||
//
|
||||
OafPartitioner other = (OafPartitioner) obj;
|
||||
if (numPartitions != other.numPartitions) {
|
||||
return false;
|
||||
}
|
||||
//
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -37,8 +37,8 @@ public class SparkCreateConnectedComponent {
|
|||
final String inputPath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
|
||||
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
|
||||
final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
|
||||
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dedup;
|
|||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
@ -17,6 +18,7 @@ import org.apache.spark.api.java.function.PairFunction;
|
|||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -27,26 +29,28 @@ import java.util.List;
|
|||
public class SparkCreateDedupRecord {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkCreateDedupRecord.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
|
||||
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
|
||||
.mapToPair((PairFunction<String,String,String>)it->
|
||||
new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
|
||||
);
|
||||
|
||||
// final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
|
||||
// parser.parseArgument(args);
|
||||
// final SparkSession spark = SparkSession
|
||||
// .builder()
|
||||
// .appName(SparkCreateDedupRecord.class.getSimpleName())
|
||||
// .master(parser.get("master"))
|
||||
// .getOrCreate();
|
||||
//
|
||||
// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
// final String inputPath = parser.get("sourcePath");
|
||||
// final String entity = parser.get("entity");
|
||||
// final String targetPath = parser.get("targetPath");
|
||||
//// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
// final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
//
|
||||
// //<id, json_entity>
|
||||
// final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
|
||||
// .mapToPair((PairFunction<String,String,String>)it->
|
||||
// new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
|
||||
// );
|
||||
|
||||
// //<source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||
// JavaPairRDD<String,String> mergeRels = spark
|
||||
// .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
|
||||
// .where("relClass=='merges'")
|
||||
|
@ -56,46 +60,12 @@ public class SparkCreateDedupRecord {
|
|||
// new Tuple2<String,String>(r.getTarget(), r.getSource())
|
||||
// );
|
||||
//
|
||||
//
|
||||
// //<dedup_id, json_entity_merged>
|
||||
// final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||
//
|
||||
// Comparator<String> c = new Comparator<String>() {
|
||||
// @Override
|
||||
// public int compare(String s, String t1) {
|
||||
// return 0;
|
||||
// }
|
||||
// };
|
||||
// final JavaPairRDD<String, String> stringStringJavaPairRDD = p.repartitionAndSortWithinPartitions(p.partitioner().get(), c);
|
||||
|
||||
StructType schema = Encoders.bean(Publication.class).schema();
|
||||
|
||||
// List<Foo> inputValues = Arrays.asList(
|
||||
// new Foo("k",5),
|
||||
// new Foo("a",1),
|
||||
// new Foo("a",30),
|
||||
// new Foo("a",18),
|
||||
// new Foo("a",22),
|
||||
// new Foo("b",22),
|
||||
// new Foo("c",5),
|
||||
// new Foo("a",5),
|
||||
// new Foo("s",1),
|
||||
// new Foo("h",4)
|
||||
// );
|
||||
//
|
||||
//
|
||||
// final JavaPairRDD<Foo, Foo> fooFighters = sc.parallelize(inputValues).mapToPair((PairFunction<Foo, Foo, Foo>) i -> new Tuple2<Foo, Foo>(i, i));
|
||||
//
|
||||
//
|
||||
// FooComparator c = new FooComparator();
|
||||
// final List<Tuple2<String, List<Foo>>> result =
|
||||
// fooFighters.repartitionAndSortWithinPartitions(new FooPartitioner(fooFighters.getNumPartitions()), c)
|
||||
// .mapToPair((PairFunction<Tuple2<Foo, Foo>, String, Foo>) t-> new Tuple2<String,Foo>(t._1().getValue(), t._2()) )
|
||||
// .groupByKey()
|
||||
// .mapValues((Function<Iterable<Foo>, List<Foo>>) Lists::newArrayList)
|
||||
// .collect();
|
||||
//
|
||||
//
|
||||
// System.out.println(result);
|
||||
|
||||
System.out.println(schema);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -44,7 +44,8 @@ public class SparkCreateSimRels {
|
|||
final String inputPath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
|
||||
|
||||
final long total = sc.textFile(inputPath + "/" + entity).count();
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
package eu.dnetlib.dedup.graph
|
||||
|
||||
|
||||
import eu.dnetlib.pace.model.MapDocument
|
||||
import org.apache.spark.graphx._
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
|
@ -25,7 +23,7 @@ object GraphProcessor {
|
|||
}
|
||||
val connectedComponents = joinResult.groupByKey()
|
||||
.map[ConnectedComponent](cc => asConnectedComponent(cc))
|
||||
(connectedComponents)
|
||||
connectedComponents
|
||||
}
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -1,5 +1,7 @@
|
|||
package eu.dnetlib.dedup;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Before;
|
||||
|
@ -8,36 +10,37 @@ import org.junit.Test;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class SparkCreateDedupTest {
|
||||
|
||||
|
||||
String configuration;
|
||||
|
||||
@Before
|
||||
public void setUp() throws IOException {
|
||||
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_vertex"));
|
||||
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_rels"));
|
||||
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json"));
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void dedupTest() throws Exception {
|
||||
final String configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
|
||||
|
||||
|
||||
public void createSimRelsTest() throws Exception {
|
||||
SparkCreateSimRels.main(new String[] {
|
||||
"-mt", "local[*]",
|
||||
"-s", "/home/sandro/betadump",
|
||||
"-s", "/Users/miconis/dumps",
|
||||
"-e", "publication",
|
||||
"-c", configuration,
|
||||
"-t", "/tmp/dedup",
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void createCCTest() throws Exception {
|
||||
|
||||
SparkCreateConnectedComponent.main(new String[] {
|
||||
"-mt", "local[*]",
|
||||
"-s", "/home/sandro/betadump",
|
||||
"-s", "/Users/miconis/dumps",
|
||||
"-e", "publication",
|
||||
"-c", configuration,
|
||||
"-t", "/tmp/dedup",
|
||||
|
@ -49,14 +52,10 @@ public class SparkCreateDedupTest {
|
|||
public void dedupRecordTest() throws Exception {
|
||||
SparkCreateDedupRecord.main(new String[] {
|
||||
"-mt", "local[*]",
|
||||
"-s", "/home/sandro/betadump",
|
||||
"-s", "/Users/miconis/dumps",
|
||||
"-e", "publication",
|
||||
"-c", "configuration",
|
||||
"-c", configuration,
|
||||
"-t", "/tmp/dedup",
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"queueMaxSize": "2000",
|
||||
"groupMaxSize": "50",
|
||||
"slidingWindowSize": "200",
|
||||
"idPath": "$.id",
|
||||
"idPath": ".id",
|
||||
"rootBuilder": [
|
||||
"organization",
|
||||
"projectOrganization_participation_isParticipant",
|
||||
|
@ -117,14 +117,6 @@
|
|||
"host": 0.5,
|
||||
"path": 0.5
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "gridid",
|
||||
"algo": "Null",
|
||||
"type": "String",
|
||||
"weight": "0.0",
|
||||
"ignoreMissing": "true",
|
||||
"path": ".pid[] | select(.qualifier.classid==\"grid\") | .value"
|
||||
}
|
||||
],
|
||||
"blacklists": {
|
||||
|
|
|
@ -1,18 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
<dedupId>yarnRM</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
<dedupId>hdfs://nameservice1</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourceNN</name>
|
||||
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
|
||||
<dedupId>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
<dedupId>true</dedupId>
|
||||
</property>
|
||||
</configuration>
|
|
@ -14,12 +14,12 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>hbase_dump_distcp_memory_mb</name>
|
||||
<value>6144</value>
|
||||
<dedupId>6144</dedupId>
|
||||
<description>memory for distcp action copying InfoSpace dump from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hbase_dump_distcp_num_maps</name>
|
||||
<value>1</value>
|
||||
<dedupId>1</dedupId>
|
||||
<description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
|
|
@ -1,26 +1,26 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
<dedupId>yarnRM</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
<dedupId>hdfs://nameservice1</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
<dedupId>true</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
<dedupId>spark2</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
<dedupId>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</dedupId>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_db_name</name>
|
||||
<value>openaire</value>
|
||||
<dedupId>openaire</dedupId>
|
||||
</property>
|
||||
</configuration>
|
|
@ -54,7 +54,7 @@ Properties overriding order is the following:
|
|||
2. `~/.dhp/application.properties` defined properties
|
||||
3. `${workflow.source.dir}/job.properties`
|
||||
4. `job-override.properties` (located in the project root dir)
|
||||
5. `maven -Dparam=value`
|
||||
5. `maven -Dparam=value`
|
||||
|
||||
where the maven `-Dparam` property is overriding all the other ones.
|
||||
|
||||
|
@ -73,7 +73,7 @@ Workflow definition requirements
|
|||
|
||||
This property can be set using maven `-D` switch.
|
||||
|
||||
`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value.
|
||||
`[oozie_app]` is the default directory name; however, it can be set to any value as long as the `oozieAppDir` property is provided with the directory name as its value.
|
||||
|
||||
Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory.
|
||||
|
||||
|
|
|
@ -73,7 +73,7 @@
|
|||
<!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
|
||||
<id>attach-test-resources</id>
|
||||
<properties>
|
||||
<!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package requires all test dependencies. -->
|
||||
<!-- Overriding the default scope (set to 'runtime') with the 'test' value. Test resources attached to the oozie package require all test dependencies. -->
|
||||
<oozie.package.dependencies.include.scope />
|
||||
<oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
|
||||
<!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
|
||||
|
@ -326,7 +326,7 @@
|
|||
</goals>
|
||||
<configuration>
|
||||
<tasks>
|
||||
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
|
||||
<property name="assembly-resources.loc" dedupId="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
|
||||
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
|
||||
</tasks>
|
||||
</configuration>
|
||||
|
|
Loading…
Reference in New Issue