implementation of the sorting by trust mechanism and the merge of oaf entities

This commit is contained in:
miconis 2019-12-10 14:57:16 +01:00
parent cc63706347
commit 4b66b471a4
31 changed files with 769 additions and 1877 deletions

View File

@ -23,4 +23,23 @@ public class Context implements Serializable {
public void setDataInfo(List<DataInfo> dataInfo) {
this.dataInfo = dataInfo;
}
/**
 * Hash code derived from the context id only, consistent with {@code equals}.
 * A context whose id has not been set yet hashes to 0 instead of throwing.
 */
@Override
public int hashCode() {
    return java.util.Objects.hashCode(id);
}
/**
 * Two contexts are equal when they are of the same class and carry the same id.
 * Null ids are compared safely: two contexts without an id are considered equal.
 */
@Override
public boolean equals(Object obj) {
    if (this == obj)
        return true;
    if (obj == null)
        return false;
    if (getClass() != obj.getClass())
        return false;
    Context other = (Context) obj;
    // Objects.equals avoids the NullPointerException raised by id.equals(...) on a null id
    return java.util.Objects.equals(id, other.getId());
}
}

View File

@ -23,4 +23,25 @@ public class Field<T> implements Serializable {
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
/**
 * Hash code derived from the wrapped value only, consistent with {@code equals}.
 * A field without a value hashes to 0 instead of throwing.
 */
@Override
public int hashCode(){
    return java.util.Objects.hashCode(getValue());
}
/**
 * Two fields are equal when they are of the same class and wrap equal values.
 * Null values are compared safely: two fields without a value are considered equal.
 */
@Override
public boolean equals(Object obj) {
    if (this == obj)
        return true;
    if (obj == null)
        return false;
    if (getClass() != obj.getClass())
        return false;
    // wildcard cast: the type argument cannot be checked at runtime and is not needed here
    Field<?> other = (Field<?>) obj;
    return java.util.Objects.equals(getValue(), other.getValue());
}
}

View File

@ -33,4 +33,27 @@ public class KeyValue implements Serializable {
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
/**
 * Builds the case-insensitive representation used by {@code equals} and {@code hashCode}:
 * the lower-cased key and value joined by "::", with a missing part rendered as an empty string.
 */
public String toComparableString() {
    final String normalizedKey = (key == null) ? "" : key.toLowerCase();
    final String normalizedValue = (value == null) ? "" : value.toLowerCase();
    return String.format("%s::%s", normalizedKey, normalizedValue);
}
/** Hash code of the comparable string, so that it stays consistent with {@code equals}. */
@Override
public int hashCode() {
    final String comparable = toComparableString();
    return comparable.hashCode();
}
/**
 * Two key/value pairs are equal when they are of the same class and their
 * comparable strings (lower-cased key and value) match.
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
        return false;
    }
    final String mine = toComparableString();
    final String theirs = ((KeyValue) obj).toComparableString();
    return mine.equals(theirs);
}
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;
public abstract class OafEntity extends Oaf implements Serializable {
@ -84,4 +85,32 @@ public abstract class OafEntity extends Oaf implements Serializable {
public void setOaiprovenance(OAIProvenance oaiprovenance) {
this.oaiprovenance = oaiprovenance;
}
/**
 * Merges the entity-level fields of {@code e} into this entity.
 * <p>
 * List fields are concatenated and de-duplicated through {@code mergeLists}.
 * Scalar fields are taken from {@code e} only when {@code e} actually provides a
 * value, so an incoming entity lacking a field never erases what was already merged.
 *
 * @param e the entity to merge into this one; {@code null} is ignored
 */
public void mergeFrom(OafEntity e) {
    if (e == null)
        return;
    originalId = mergeLists(originalId, e.getOriginalId());
    collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
    pid = mergeLists(pid, e.getPid());
    if (e.getDateofcollection() != null)
        dateofcollection = e.getDateofcollection();
    if (e.getDateoftransformation() != null)
        dateoftransformation = e.getDateoftransformation();
    extraInfo = mergeLists(extraInfo, e.getExtraInfo());
    if (e.getOaiprovenance() != null)
        oaiprovenance = e.getOaiprovenance();
}
/**
 * Concatenates the given lists into a new list, skipping {@code null} lists and
 * keeping only the first occurrence of each element (as decided by {@code equals}/{@code hashCode}).
 *
 * @param lists the lists to merge, in order; individual entries may be {@code null}
 * @return a new list with the distinct elements in encounter order; empty when nothing was provided
 */
protected <T> List<T> mergeLists(final List<T>... lists) {
    // insertion-ordered set: drops duplicates while preserving the first-seen order
    final Set<T> unique = new LinkedHashSet<>();
    for (final List<T> current : lists) {
        if (current != null) {
            unique.addAll(current);
        }
    }
    return new ArrayList<>(unique);
}
}

View File

@ -14,4 +14,16 @@ public class Publication extends Result implements Serializable {
public void setJournal(Journal journal) {
this.journal = journal;
}
/**
 * Merges {@code e} into this publication: first the inherited fields, then the journal,
 * which is replaced only when {@code e} provides one.
 *
 * @param e the publication to merge into this one; {@code null} is ignored
 */
@Override
public void mergeFrom(OafEntity e) {
    // the parent implementation silently returns on null, so bail out here as well
    // instead of dereferencing the null cast result below
    if (e == null)
        return;
    super.mergeFrom(e);
    Publication p = (Publication) e;
    if (p.getJournal() != null)
        journal = p.getJournal();
}
}

View File

@ -40,4 +40,32 @@ public class Qualifier implements Serializable {
public void setSchemename(String schemename) {
this.schemename = schemename;
}
/**
 * Builds the representation used by {@code equals} and {@code hashCode}: class id, class name,
 * scheme id and scheme name joined by "::", with a missing part rendered as an empty string.
 */
public String toComparableString() {
    final String idPart = (classid == null) ? "" : classid;
    final String namePart = (classname == null) ? "" : classname;
    final String schemeIdPart = (schemeid == null) ? "" : schemeid;
    final String schemeNamePart = (schemename == null) ? "" : schemename;
    return String.format("%s::%s::%s::%s", idPart, namePart, schemeIdPart, schemeNamePart);
}
/** Hash code of the comparable string, so that it stays consistent with {@code equals}. */
@Override
public int hashCode() {
    final String comparable = toComparableString();
    return comparable.hashCode();
}
/**
 * Two qualifiers are equal when they are of the same class and their
 * comparable strings match.
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
        return false;
    }
    final String mine = toComparableString();
    final String theirs = ((Qualifier) obj).toComparableString();
    return mine.equals(theirs);
}
}

View File

@ -1,7 +1,11 @@
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.Objects;
public abstract class Result extends OafEntity implements Serializable {
@ -12,35 +16,35 @@ public abstract class Result extends OafEntity implements Serializable {
// common fields
private Qualifier language;
private List<Qualifier> country;
private List<StructuredProperty> subject;
private List<StructuredProperty> title;
private List<StructuredProperty> relevantdate;
private List<Field<String>> description;
private Field<String> dateofacceptance;
private Field<String> publisher;
private Field<String> embargoenddate;
private List<Field<String>> source;
private List<Field<String>> fulltext; // remove candidate
private List<Field<String>> format;
private List<Field<String>> contributor;
private Qualifier resourcetype;
private List<Field<String>> coverage;
private Field<String> refereed; //peer-review status
private List<Context> context;
@ -240,4 +244,145 @@ public abstract class Result extends OafEntity implements Serializable {
this.processingchargecurrency = processingchargecurrency;
return this;
}
/**
 * Merges {@code e} into this result: first the inherited entity fields, then the
 * result-level ones. List fields are concatenated and de-duplicated through
 * {@code mergeLists}; scalar fields are replaced only when {@code e} provides a value.
 *
 * @param e the result to merge into this one; {@code null} is ignored
 */
@Override
public void mergeFrom(OafEntity e) {
    // the parent implementation silently returns on null, so bail out here as well
    // instead of dereferencing the null cast result below
    if (e == null)
        return;
    super.mergeFrom(e);
    Result r = (Result) e;
    mergeAuthors(r.getAuthor());
    // TODO mergeFrom is used only to create dedup records: dateofacceptance and instance
    // are deliberately not merged here because they require more complex functions
    // (maybe they will be filled in by an external function).
    if (r.getResulttype() != null)
        resulttype = r.getResulttype();
    if (r.getLanguage() != null)
        language = r.getLanguage();
    country = mergeLists(country, r.getCountry());
    subject = mergeLists(subject, r.getSubject());
    title = mergeLists(title, r.getTitle());
    relevantdate = mergeLists(relevantdate, r.getRelevantdate());
    description = mergeLists(description, r.getDescription());
    if (r.getPublisher() != null)
        publisher = r.getPublisher();
    if (r.getEmbargoenddate() != null)
        embargoenddate = r.getEmbargoenddate();
    source = mergeLists(source, r.getSource());
    fulltext = mergeLists(fulltext, r.getFulltext());
    format = mergeLists(format, r.getFormat());
    contributor = mergeLists(contributor, r.getContributor());
    if (r.getResourcetype() != null)
        resourcetype = r.getResourcetype();
    coverage = mergeLists(coverage, r.getCoverage());
    if (r.getRefereed() != null)
        refereed = r.getRefereed();
    context = mergeLists(context, r.getContext());
    if (r.getProcessingchargeamount() != null)
        processingchargeamount = r.getProcessingchargeamount();
    if (r.getProcessingchargecurrency() != null)
        processingchargecurrency = r.getProcessingchargecurrency();
    externalReference = mergeLists(externalReference, r.getExternalReference());
}
/**
 * Decides whether the incoming author list should replace the current one.
 * <p>
 * The incoming list wins in two cases only:
 * <ul>
 *   <li>neither list has an author with a pid and the incoming list is longer;</li>
 *   <li>this result has no author list at all while an incoming list exists.</li>
 * </ul>
 * In every other case the current authors are left untouched.
 *
 * @param authors the candidate author list, possibly {@code null}
 */
public void mergeAuthors(List<Author> authors){
    // countAuthorsPids yields -1 for a missing list, so 0 means "list present, no pids"
    final int currentPids = countAuthorsPids(author);
    final int incomingPids = countAuthorsPids(authors);

    final boolean neitherHasPids = currentPids == 0 && incomingPids == 0;
    final boolean incomingIsLonger = neitherHasPids && authorsSize(author) < authorsSize(authors);

    // current list is missing (-1) and the incoming one exists (>= 0)
    final boolean currentIsMissing = currentPids < 0 && currentPids < incomingPids;

    if (incomingIsLonger || currentIsMissing) {
        author = authors;
    }
}
/**
 * Counts the authors for which {@code extractAuthorPid} yields a non-blank identifier.
 *
 * @param authors the author list, possibly {@code null}
 * @return the number of authors with a pid, or -1 when the list itself is {@code null}
 */
public int countAuthorsPids(List<Author> authors){
    if (authors == null) {
        return -1;
    }
    int withPid = 0;
    for (final Author candidate : authors) {
        // isNotBlank is false for null as well, so a missing pid is not counted
        if (StringUtils.isNotBlank(extractAuthorPid(candidate))) {
            withPid++;
        }
    }
    return withPid;
}
/**
 * Null-safe size of an author list.
 *
 * @param authors the author list, possibly {@code null}
 * @return the number of authors, 0 when the list is {@code null}
 */
public int authorsSize(List<Author> authors){
    return (authors == null) ? 0 : authors.size();
}
/**
 * Picks the main identifier of an author: the value of the last pid whose qualifier
 * class id is "orcid" (case-insensitive) when there is one, otherwise the first
 * usable pid value found.
 * <p>
 * Pids that are {@code null} or carry no value are skipped, and a pid without
 * qualifier or class id is simply treated as a non-ORCID one.
 *
 * @param a the author, possibly {@code null}
 * @return the chosen pid value; {@code null} when the author or its pid list is missing or empty;
 *         an empty string when no pid carries a usable value
 */
public String extractAuthorPid(Author a){
    if(a == null || a.getPid() == null || a.getPid().size() == 0)
        return null;
    StringBuilder mainPid = new StringBuilder();
    a.getPid().forEach(pid ->{
        // appending a null value would store the literal text "null", which callers
        // would then mistake for a real identifier
        if (pid == null || pid.getValue() == null)
            return;
        final boolean isOrcid = pid.getQualifier() != null
                && "orcid".equalsIgnoreCase(pid.getQualifier().getClassid());
        if (isOrcid) {
            // an ORCID always overrides whatever was selected so far
            mainPid.setLength(0);
            mainPid.append(pid.getValue());
        }
        else {
            // other pid types are only a fallback: keep the first one
            if(mainPid.length() == 0)
                mainPid.append(pid.getValue());
        }
    });
    return mainPid.toString();
}
}

View File

@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable {
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
/**
 * Builds the case-insensitive representation used by {@code equals} and {@code hashCode}:
 * the lower-cased value and the lower-cased comparable string of the qualifier joined by "::",
 * with a missing part rendered as an empty string.
 */
public String toComparableString(){
    final String valuePart = (value == null) ? "" : value.toLowerCase();
    final String qualifierPart = (qualifier == null) ? "" : qualifier.toComparableString().toLowerCase();
    return String.format("%s::%s", valuePart, qualifierPart);
}
/** Hash code of the comparable string, so that it stays consistent with {@code equals}. */
@Override
public int hashCode() {
    final String comparable = toComparableString();
    return comparable.hashCode();
}
/**
 * Two structured properties are equal when they are of the same class and their
 * comparable strings (lower-cased value and qualifier) match.
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
        return false;
    }
    final String mine = toComparableString();
    final String theirs = ((StructuredProperty) obj).toComparableString();
    return mine.equals(theirs);
}
}

View File

@ -0,0 +1,89 @@
package eu.dnetlib.dhp.schema.oaf;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Tests for the list merging helper of {@link OafEntity} and for {@code mergeFrom}
 * on publications.
 */
public class MergeTest {

    OafEntity oaf;

    @Before
    public void setUp() {
        oaf = new Publication();
    }

    /** mergeLists concatenates in order, keeps first occurrences only and ignores null lists. */
    @Test
    public void mergeListsTest() {
        //string list merge test
        List<String> a = Arrays.asList("a", "b", "c", "e");
        List<String> b = Arrays.asList("a", "b", "c", "d");
        List<String> c = null;

        Assert.assertEquals(Arrays.asList("a", "b", "c", "e", "d"), oaf.mergeLists(a, b));
        Assert.assertEquals(a, oaf.mergeLists(a, c));
        Assert.assertTrue(oaf.mergeLists(c, c).isEmpty());
    }

    /** KeyValue equality is case-insensitive, so "a::open" and "A::open" collapse into one entry. */
    @Test
    public void mergePublicationCollectedFromTest() {
        Publication a = new Publication();
        Publication b = new Publication();

        a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed")));
        b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open")));

        a.mergeFrom(b);

        Assert.assertNotNull(a.getCollectedfrom());
        Assert.assertEquals(3, a.getCollectedfrom().size());
    }

    /** StructuredProperty equality is case-insensitive, so subjects "a" and "A" collapse into one entry. */
    @Test
    public void mergePublicationSubjectTest() {
        Publication a = new Publication();
        Publication b = new Publication();

        a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe")));
        b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe")));

        a.mergeFrom(b);

        Assert.assertNotNull(a.getSubject());
        Assert.assertEquals(3, a.getSubject().size());
    }

    /** Builds a KeyValue with the given key and value. */
    private KeyValue setKV(final String key, final String value) {
        KeyValue k = new KeyValue();
        k.setKey(key);
        k.setValue(value);
        return k;
    }

    /**
     * Builds a StructuredProperty with the given value and a qualifier whose scheme id/name
     * are both {@code schema} and whose class id/name are both {@code classname}.
     */
    private StructuredProperty setSP(final String value, final String schema, final String classname) {
        StructuredProperty s = new StructuredProperty();
        s.setValue(value);
        Qualifier q = new Qualifier();
        q.setClassname(classname);
        q.setClassid(classname);
        q.setSchemename(schema);
        q.setSchemeid(schema);
        s.setQualifier(q);
        return s;
    }
}

View File

@ -9,7 +9,7 @@
<xsl:copy-of select="//oai:header"/>
<metadata>
<xsl:for-each select="//*[local-name()='subject']">
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
</xsl:for-each>
</metadata>
<oaf:about>

View File

@ -1,11 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
<RESOURCE_KIND value="TransformationRuleDSResources"/>
<RESOURCE_URI value=""/>
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
<RESOURCE_KIND value="TransformationRuleDSResources"/>
<RESOURCE_URI value=""/>
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
</HEADER>
<BODY>
<CONFIGURATION>
@ -24,7 +24,7 @@
<xsl:copy-of select="//oai:header"/>
<metadata>
<xsl:for-each select="//*[local-name()='subject']">
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
</xsl:for-each>
</metadata>
<oaf:about>

View File

@ -0,0 +1,169 @@
package eu.dnetlib.dedup;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import static java.util.stream.Collectors.toMap;
/**
 * Creates the dedup records: for every group of entities connected by "merges" relations,
 * the JSON serializations of the group members are collected and merged into one entity.
 */
public class DedupRecordFactory {

    /** Shared JSON mapper, reused across groups instead of being rebuilt for each of them. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Joins the merge relations with the raw entities, orders the members of each group by
     * trust and merges every group into a single entity of the requested type.
     *
     * @param sc                 Spark context used to read the entities
     * @param spark              Spark session used to read the merge relations
     * @param mergeRelsInputPath location of the stored {@code Relation}s
     * @param entitiesInputPath  location of the JSON entities, one per line
     * @param entityType         the type of entity to build dedup records for
     * @param dedupConf          dedup configuration, used to locate the entity id in the JSON
     * @return one merged entity per dedup id
     * @throws IllegalArgumentException when {@code entityType} is not supported
     */
    public JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){

        //<id, json_entity>
        final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
                .mapToPair((PairFunction<String,String,String>) it->
                        new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
                );

        //<source, target>: source is the dedup_id, target is the id of the mergedIn
        JavaPairRDD<String,String> mergeRels = spark
                .read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
                .where("relClass=='merges'")
                .javaRDD()
                .mapToPair(
                        (PairFunction<Relation, String,String>)r->
                                new Tuple2<String, String>(r.getTarget(), r.getSource())
                );

        //<dedup_id, json_entity_merged>
        final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);

        JavaPairRDD<OafKey, String> keyJson = joinResult.mapToPair((PairFunction<Tuple2<String, String>, OafKey, String>) json -> {

            String idValue = json._1();

            String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());

            //TODO remember to replace this with the actual trust retrieving
            if (StringUtils.isBlank(trust)) {
                // placeholder: random trust in [0.80, 0.99] for records that carry none
                Random generator = new Random();
                int number = generator.nextInt(20);
                double result = (number / 100.0) + 0.80;
                trust = "" + result;
            }

            return new Tuple2<OafKey, String>(new OafKey(idValue, trust), json._2());
        });

        OafComparator c = new OafComparator();
        //<dedup_id, mergedRecordsSortedByTrust>
        // NOTE(review): groupByKey is applied after the keys have been re-mapped, which may
        // trigger a new shuffle; confirm that the per-group trust ordering survives it.
        JavaPairRDD<String, Iterable<String>> sortedJoinResult = keyJson.repartitionAndSortWithinPartitions(new OafPartitioner(keyJson.getNumPartitions()), c)
                .mapToPair((PairFunction<Tuple2<OafKey, String>, String, String>) t -> new Tuple2<String, String>(t._1().getDedupId(), t._2()))
                .groupByKey();

        // The mergers are static on purpose: a reference such as this::publicationMerger would
        // capture this factory inside the Spark closure, and the factory is not Serializable.
        switch(entityType){
            case Publication:
                return sortedJoinResult.map(DedupRecordFactory::publicationMerger);
            case Dataset:
                return sortedJoinResult.map(DedupRecordFactory::datasetMerger);
            case Project:
                return sortedJoinResult.map(DedupRecordFactory::projectMerger);
            case Software:
                return sortedJoinResult.map(DedupRecordFactory::softwareMerger);
            case Datasource:
                return sortedJoinResult.map(DedupRecordFactory::datasourceMerger);
            case Organization:
                return sortedJoinResult.map(DedupRecordFactory::organizationMerger);
            case OtherResearchProduct:
                return sortedJoinResult.map(DedupRecordFactory::otherresearchproductMerger);
            default:
                throw new IllegalArgumentException("Unsupported entity type: " + entityType);
        }
    }

    /**
     * Merges all the publications of a group into a single publication identified by the dedup id.
     *
     * @param e pair of dedup id and JSON serializations of the publications to merge
     * @return the merged publication; its authors are not populated yet (see TODO below)
     * @throws IllegalStateException when one of the JSON records cannot be parsed
     */
    private static Publication publicationMerger(Tuple2<String, Iterable<String>> e){

        Publication p = new Publication(); //the result of the merge, to be returned at the end

        p.setId(e._1());

        // Collected for the pending author/date/instance merge strategy; not consumed yet.
        final Collection<String> dateofacceptance = Lists.newArrayList();
        final Collection<List<Author>> authors = Lists.newArrayList();
        final Collection<List<Instance>> instances = Lists.newArrayList();

        // TODO the selected trust is tracked but not yet written to the merged record
        StringBuilder trust = new StringBuilder("0.0");

        e._2().forEach(pub -> {
            final Publication publication;
            try {
                publication = MAPPER.readValue(pub, Publication.class);
            } catch (IOException exc) {
                // fail loudly: silently skipping the record would yield an incomplete dedup record
                throw new IllegalStateException("Unable to parse a publication merged into " + e._1(), exc);
            }

            // records without dataInfo/trust are expected (see the fallback in createDedupRecord)
            // and must still be merged
            final String currentTrust = publication.getDataInfo() != null ? publication.getDataInfo().getTrust() : null;
            if (currentTrust != null && !currentTrust.equals("1.0")) {
                trust.setLength(0);
                trust.append(currentTrust);
            }

            p.mergeFrom(publication);

            //add to the list if they are not null
            if (publication.getDateofacceptance() != null)
                dateofacceptance.add(publication.getDateofacceptance().getValue());
            if (publication.getAuthor() != null)
                authors.add(publication.getAuthor());
            if (publication.getInstance() != null)
                instances.add(publication.getInstance());
        });

        p.setAuthor(null); //TODO create a single list of authors to put in the final publication

        return p;
    }

    /** Not implemented yet. */
    private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e){

        throw new NotImplementedException();
    }

    /** Not implemented yet. */
    private static Project projectMerger(Tuple2<String, Iterable<String>> e){

        throw new NotImplementedException();
    }

    /** Not implemented yet. */
    private static Software softwareMerger(Tuple2<String, Iterable<String>> e){

        throw new NotImplementedException();
    }

    /** Not implemented yet. */
    private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){

        throw new NotImplementedException();
    }

    /** Not implemented yet. */
    private static Organization organizationMerger(Tuple2<String, Iterable<String>> e){

        throw new NotImplementedException();
    }

    /** Not implemented yet. */
    private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){

        throw new NotImplementedException();
    }
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.dedup;
import com.google.common.collect.ComparisonChain;
import java.io.Serializable;
import java.util.Comparator;
/**
 * Orders {@link OafKey}s by dedup id first and, within the same dedup id, by trust.
 * Both parts are compared as strings through their natural ordering.
 */
public class OafComparator implements Comparator<OafKey>, Serializable {

    @Override
    public int compare(OafKey a, OafKey b) {
        final int byDedupId = a.getDedupId().compareTo(b.getDedupId());
        if (byDedupId != 0) {
            // normalised to -1/1, the trust is not looked at when the ids differ
            return Integer.signum(byDedupId);
        }
        return Integer.signum(a.getTrust().compareTo(b.getTrust()));
    }
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.dedup;
/**
 * The entity types a dedup record can be requested for.
 * DedupRecordFactory selects the merger to apply based on this value.
 */
public enum OafEntityType {
Datasource,
Organization,
Project,
Dataset,
OtherResearchProduct,
Software,
Publication
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dedup;
import java.io.Serializable;
/**
 * Composite key made of a dedup id and a trust value (kept as a string), used to
 * partition records by dedup id and sort them by trust.
 */
public class OafKey implements Serializable {

    private String dedupId;
    private String trust;

    /**
     * @param dedupId identifier of the dedup record the keyed entity is merged into
     * @param trust   trust of the keyed entity, as a string
     */
    public OafKey(String dedupId, String trust) {
        this.dedupId = dedupId;
        this.trust = trust;
    }

    /** No-argument constructor; both parts are left {@code null} until set. */
    public OafKey() {
    }

    public String getDedupId() {
        return dedupId;
    }

    public void setDedupId(String dedupId) {
        this.dedupId = dedupId;
    }

    public String getTrust() {
        return trust;
    }

    public void setTrust(String trust) {
        this.trust = trust;
    }

    /** Renders the key as {@code dedupId->trust}. */
    @Override
    public String toString(){
        // trust is a String: formatting it with %d throws IllegalFormatConversionException
        return String.format("%s->%s", dedupId, trust);
    }
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.dedup;
import org.apache.spark.Partitioner;
import java.io.Serializable;
/**
 * Partitioner that assigns {@link OafKey}s to partitions using the dedup id only,
 * so that all the keys sharing a dedup id land in the same partition regardless of their trust.
 */
public class OafPartitioner extends Partitioner implements Serializable {

    private final int numPartitions;

    /**
     * @param partitions number of partitions; expected to be positive (checked by an assertion only)
     */
    public OafPartitioner(int partitions) {
        assert (partitions > 0);
        this.numPartitions = partitions;
    }

    @Override
    public int numPartitions() {
        return numPartitions;
    }

    /**
     * @param key the key to place; must be an {@link OafKey}
     * @return the partition index computed from the hash of the dedup id
     * @throws IllegalArgumentException when the key is not an {@link OafKey}
     */
    @Override
    public int getPartition(Object key) {
        if (!(key instanceof OafKey)) {
            throw new IllegalArgumentException("Unexpected Key");
        }
        final OafKey item = (OafKey) key;
        // the remainder can be negative for negative hashes, hence the absolute value
        final int bucket = item.getDedupId().hashCode() % numPartitions;
        return Math.abs(bucket);
    }

    /** Hash code based on the number of partitions only. */
    @Override
    public int hashCode() {
        return 31 + numPartitions;
    }

    /** Two partitioners are equal when they define the same number of partitions. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // instanceof is false for null, which covers the null argument as well
        return obj instanceof OafPartitioner
                && numPartitions == ((OafPartitioner) obj).numPartitions;
    }
}

View File

@ -37,8 +37,8 @@ public class SparkCreateConnectedComponent {
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dedup;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ -17,6 +18,7 @@ import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
@ -27,26 +29,28 @@ import java.util.List;
public class SparkCreateDedupRecord {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkCreateDedupRecord.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
.mapToPair((PairFunction<String,String,String>)it->
new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
);
// final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
// parser.parseArgument(args);
// final SparkSession spark = SparkSession
// .builder()
// .appName(SparkCreateDedupRecord.class.getSimpleName())
// .master(parser.get("master"))
// .getOrCreate();
//
// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
// final String inputPath = parser.get("sourcePath");
// final String entity = parser.get("entity");
// final String targetPath = parser.get("targetPath");
//// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
// final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
//
// //<id, json_entity>
// final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
// .mapToPair((PairFunction<String,String,String>)it->
// new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
// );
// //<source, target>: source is the dedup_id, target is the id of the mergedIn
// JavaPairRDD<String,String> mergeRels = spark
// .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
// .where("relClass=='merges'")
@ -56,46 +60,12 @@ public class SparkCreateDedupRecord {
// new Tuple2<String,String>(r.getTarget(), r.getSource())
// );
//
//
// //<dedup_id, json_entity_merged>
// final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
//
// Comparator<String> c = new Comparator<String>() {
// @Override
// public int compare(String s, String t1) {
// return 0;
// }
// };
// final JavaPairRDD<String, String> stringStringJavaPairRDD = p.repartitionAndSortWithinPartitions(p.partitioner().get(), c);
StructType schema = Encoders.bean(Publication.class).schema();
// List<Foo> inputValues = Arrays.asList(
// new Foo("k",5),
// new Foo("a",1),
// new Foo("a",30),
// new Foo("a",18),
// new Foo("a",22),
// new Foo("b",22),
// new Foo("c",5),
// new Foo("a",5),
// new Foo("s",1),
// new Foo("h",4)
// );
//
//
// final JavaPairRDD<Foo, Foo> fooFighters = sc.parallelize(inputValues).mapToPair((PairFunction<Foo, Foo, Foo>) i -> new Tuple2<Foo, Foo>(i, i));
//
//
// FooComparator c = new FooComparator();
// final List<Tuple2<String, List<Foo>>> result =
// fooFighters.repartitionAndSortWithinPartitions(new FooPartitioner(fooFighters.getNumPartitions()), c)
// .mapToPair((PairFunction<Tuple2<Foo, Foo>, String, Foo>) t-> new Tuple2<String,Foo>(t._1().getValue(), t._2()) )
// .groupByKey()
// .mapValues((Function<Iterable<Foo>, List<Foo>>) Lists::newArrayList)
// .collect();
//
//
// System.out.println(result);
System.out.println(schema);
}
}

View File

@ -44,7 +44,8 @@ public class SparkCreateSimRels {
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final long total = sc.textFile(inputPath + "/" + entity).count();

View File

@ -1,7 +1,5 @@
package eu.dnetlib.dedup.graph
import eu.dnetlib.pace.model.MapDocument
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
@ -25,7 +23,7 @@ object GraphProcessor {
}
val connectedComponents = joinResult.groupByKey()
.map[ConnectedComponent](cc => asConnectedComponent(cc))
(connectedComponents)
connectedComponents
}

View File

@ -1,5 +1,7 @@
package eu.dnetlib.dedup;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
@ -8,36 +10,37 @@ import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.List;
public class SparkCreateDedupTest {
String configuration;
@Before
public void setUp() throws IOException {
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_vertex"));
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_rels"));
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json"));
}
@Test
@Ignore
public void dedupTest() throws Exception {
final String configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
public void createSimRelsTest() throws Exception {
SparkCreateSimRels.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump",
"-s", "/Users/miconis/dumps",
"-e", "publication",
"-c", configuration,
"-t", "/tmp/dedup",
});
}
@Test
@Ignore
public void createCCTest() throws Exception {
SparkCreateConnectedComponent.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump",
"-s", "/Users/miconis/dumps",
"-e", "publication",
"-c", configuration,
"-t", "/tmp/dedup",
@ -49,14 +52,10 @@ public class SparkCreateDedupTest {
public void dedupRecordTest() throws Exception {
SparkCreateDedupRecord.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump",
"-s", "/Users/miconis/dumps",
"-e", "publication",
"-c", "configuration",
"-c", configuration,
"-t", "/tmp/dedup",
});
}
}

View File

@ -7,7 +7,7 @@
"queueMaxSize": "2000",
"groupMaxSize": "50",
"slidingWindowSize": "200",
"idPath": "$.id",
"idPath": ".id",
"rootBuilder": [
"organization",
"projectOrganization_participation_isParticipant",
@ -117,14 +117,6 @@
"host": 0.5,
"path": 0.5
}
},
{
"name": "gridid",
"algo": "Null",
"type": "String",
"weight": "0.0",
"ignoreMissing": "true",
"path": ".pid[] | select(.qualifier.classid==\"grid\") | .value"
}
],
"blacklists": {

View File

@ -1,18 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>sourceNN</name>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
<value>true</value>
</property>
</configuration>

View File

@ -14,12 +14,12 @@
</property>
<property>
<name>hbase_dump_distcp_memory_mb</name>
<value>6144</value>
<value>6144</value>
<description>memory for distcp action copying InfoSpace dump from remote cluster</description>
</property>
<property>
<name>hbase_dump_distcp_num_maps</name>
<value>1</value>
<value>1</value>
<description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
</property>
</parameters>

View File

@ -1,26 +1,26 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
<value>openaire</value>
</property>
</configuration>

View File

@ -54,7 +54,7 @@ Properties overriding order is the following:
2. `~/.dhp/application.properties` defined properties
3. `${workflow.source.dir}/job.properties`
4. `job-override.properties` (located in the project root dir)
5. `maven -Dparam=value`
5. `maven -Dparam=value`
where the maven `-Dparam` property is overriding all the other ones.
@ -73,7 +73,7 @@ Workflow definition requirements
This property can be set using maven `-D` switch.
`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value.
`[oozie_app]` is the default directory name; however, it can be set to any value as long as the `oozieAppDir` property is provided with the directory name as its value.
Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory.

View File

@ -73,7 +73,7 @@
<!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
<id>attach-test-resources</id>
<properties>
<!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package requires all test dependencies. -->
<!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package require all test dependencies. -->
<oozie.package.dependencies.include.scope />
<oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
<!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
@ -326,7 +326,7 @@
</goals>
<configuration>
<tasks>
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
</tasks>
</configuration>