forked from antonis.lempesis/dnet-hadoop
implementation of the sorting by trust mechanism and the merge of oaf entities
This commit is contained in:
parent
cc63706347
commit
4b66b471a4
|
@ -23,4 +23,23 @@ public class Context implements Serializable {
|
||||||
public void setDataInfo(List<DataInfo> dataInfo) {
|
public void setDataInfo(List<DataInfo> dataInfo) {
|
||||||
this.dataInfo = dataInfo;
|
this.dataInfo = dataInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return id.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (obj == null)
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Context other = (Context) obj;
|
||||||
|
|
||||||
|
return id.equals(other.getId());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,4 +23,25 @@ public class Field<T> implements Serializable {
|
||||||
public void setDataInfo(DataInfo dataInfo) {
|
public void setDataInfo(DataInfo dataInfo) {
|
||||||
this.dataInfo = dataInfo;
|
this.dataInfo = dataInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode(){
|
||||||
|
return getValue().hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (obj == null)
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Field<T> other = (Field<T>) obj;
|
||||||
|
|
||||||
|
return getValue().equals(other.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,4 +33,27 @@ public class KeyValue implements Serializable {
|
||||||
public void setDataInfo(DataInfo dataInfo) {
|
public void setDataInfo(DataInfo dataInfo) {
|
||||||
this.dataInfo = dataInfo;
|
this.dataInfo = dataInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toComparableString() {
|
||||||
|
return String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return toComparableString().hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (obj == null)
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
KeyValue other = (KeyValue) obj;
|
||||||
|
|
||||||
|
return toComparableString().equals(other.toComparableString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public abstract class OafEntity extends Oaf implements Serializable {
|
public abstract class OafEntity extends Oaf implements Serializable {
|
||||||
|
|
||||||
|
@ -84,4 +85,32 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
||||||
public void setOaiprovenance(OAIProvenance oaiprovenance) {
|
public void setOaiprovenance(OAIProvenance oaiprovenance) {
|
||||||
this.oaiprovenance = oaiprovenance;
|
this.oaiprovenance = oaiprovenance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void mergeFrom(OafEntity e) {
|
||||||
|
|
||||||
|
if (e == null)
|
||||||
|
return;
|
||||||
|
|
||||||
|
originalId = mergeLists(originalId, e.getOriginalId());
|
||||||
|
|
||||||
|
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
|
||||||
|
|
||||||
|
pid = mergeLists(pid, e.getPid());
|
||||||
|
|
||||||
|
dateofcollection = e.getDateofcollection();
|
||||||
|
|
||||||
|
dateoftransformation = e.getDateoftransformation();
|
||||||
|
|
||||||
|
extraInfo = mergeLists(extraInfo, e.getExtraInfo());
|
||||||
|
|
||||||
|
oaiprovenance = e.getOaiprovenance();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
protected <T> List<T> mergeLists(final List<T>... lists) {
|
||||||
|
|
||||||
|
return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream).distinct().collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,4 +14,16 @@ public class Publication extends Result implements Serializable {
|
||||||
public void setJournal(Journal journal) {
|
public void setJournal(Journal journal) {
|
||||||
this.journal = journal;
|
this.journal = journal;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void mergeFrom(OafEntity e) {
|
||||||
|
super.mergeFrom(e);
|
||||||
|
|
||||||
|
Publication p = (Publication) e;
|
||||||
|
|
||||||
|
if (p.getJournal() != null)
|
||||||
|
journal = p.getJournal();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,4 +40,32 @@ public class Qualifier implements Serializable {
|
||||||
public void setSchemename(String schemename) {
|
public void setSchemename(String schemename) {
|
||||||
this.schemename = schemename;
|
this.schemename = schemename;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toComparableString() {
|
||||||
|
return String.format("%s::%s::%s::%s",
|
||||||
|
classid != null ? classid : "",
|
||||||
|
classname != null ? classname : "",
|
||||||
|
schemeid != null ? schemeid : "",
|
||||||
|
schemename != null ? schemename : "");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return toComparableString().hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (obj == null)
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Qualifier other = (Qualifier) obj;
|
||||||
|
|
||||||
|
return toComparableString()
|
||||||
|
.equals(other.toComparableString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public abstract class Result extends OafEntity implements Serializable {
|
public abstract class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
|
@ -240,4 +244,145 @@ public abstract class Result extends OafEntity implements Serializable {
|
||||||
this.processingchargecurrency = processingchargecurrency;
|
this.processingchargecurrency = processingchargecurrency;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void mergeFrom(OafEntity e) {
|
||||||
|
super.mergeFrom(e);
|
||||||
|
|
||||||
|
Result r = (Result) e;
|
||||||
|
|
||||||
|
mergeAuthors(r.getAuthor());
|
||||||
|
|
||||||
|
//TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function)
|
||||||
|
// if (author == null)
|
||||||
|
// author = r.getAuthor(); //authors will be replaced because they could be too much
|
||||||
|
// dateofacceptance = r.getDateofacceptance();
|
||||||
|
// instance = mergeLists(instance, r.getInstance());
|
||||||
|
|
||||||
|
if (r.getResulttype() != null)
|
||||||
|
resulttype = r.getResulttype();
|
||||||
|
|
||||||
|
if (r.getLanguage() != null)
|
||||||
|
language = r.getLanguage();
|
||||||
|
|
||||||
|
country = mergeLists(country, r.getCountry());
|
||||||
|
|
||||||
|
subject = mergeLists(subject, r.getSubject());
|
||||||
|
|
||||||
|
title = mergeLists(title, r.getTitle());
|
||||||
|
|
||||||
|
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
||||||
|
|
||||||
|
description = mergeLists(description, r.getDescription());
|
||||||
|
|
||||||
|
if (r.getPublisher() != null)
|
||||||
|
publisher = r.getPublisher();
|
||||||
|
|
||||||
|
if (r.getEmbargoenddate() != null)
|
||||||
|
embargoenddate = r.getEmbargoenddate();
|
||||||
|
|
||||||
|
source = mergeLists(source, r.getSource());
|
||||||
|
|
||||||
|
fulltext = mergeLists(fulltext, r.getFulltext());
|
||||||
|
|
||||||
|
format = mergeLists(format, r.getFormat());
|
||||||
|
|
||||||
|
contributor = mergeLists(contributor, r.getContributor());
|
||||||
|
|
||||||
|
if (r.getResourcetype() != null)
|
||||||
|
resourcetype = r.getResourcetype();
|
||||||
|
|
||||||
|
coverage = mergeLists(coverage, r.getCoverage());
|
||||||
|
|
||||||
|
if (r.getRefereed() != null)
|
||||||
|
refereed = r.getRefereed();
|
||||||
|
|
||||||
|
context = mergeLists(context, r.getContext());
|
||||||
|
|
||||||
|
if (r.getProcessingchargeamount() != null)
|
||||||
|
processingchargeamount = r.getProcessingchargeamount();
|
||||||
|
|
||||||
|
if (r.getProcessingchargecurrency() != null)
|
||||||
|
processingchargecurrency = r.getProcessingchargecurrency();
|
||||||
|
|
||||||
|
externalReference = mergeLists(externalReference, r.getExternalReference());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void mergeAuthors(List<Author> authors){
|
||||||
|
int c1 = countAuthorsPids(author);
|
||||||
|
int c2 = countAuthorsPids(authors);
|
||||||
|
int s1 = authorsSize(author);
|
||||||
|
int s2 = authorsSize(authors);
|
||||||
|
|
||||||
|
|
||||||
|
//if both have no authors with pids and authors is bigger than author
|
||||||
|
if (c1 == 0 && c2 == 0 && author.size()<authors.size()) {
|
||||||
|
author = authors;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
//author is null and authors have 0 or more authors with pids
|
||||||
|
if (c1<c2 && c1<0) {
|
||||||
|
author = authors;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
//andiamo a mangiare
|
||||||
|
|
||||||
|
|
||||||
|
// if (author == null && authors == null)
|
||||||
|
// return;
|
||||||
|
//
|
||||||
|
// int c1 = countAuthorsPids(author);
|
||||||
|
// int c2 = countAuthorsPids(authors);
|
||||||
|
//
|
||||||
|
// if (c1<c2 && c1<1){
|
||||||
|
// author = authors;
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if (c1<c2)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public int countAuthorsPids(List<Author> authors){
|
||||||
|
if (authors == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int authorsSize(List<Author> authors){
|
||||||
|
if (authors == null)
|
||||||
|
return 0;
|
||||||
|
return authors.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String extractAuthorPid(Author a){
|
||||||
|
|
||||||
|
if(a == null || a.getPid() == null || a.getPid().size() == 0)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
StringBuilder mainPid = new StringBuilder();
|
||||||
|
|
||||||
|
a.getPid().forEach(pid ->{
|
||||||
|
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
|
||||||
|
mainPid.setLength(0);
|
||||||
|
mainPid.append(pid.getValue());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if(mainPid.length() == 0)
|
||||||
|
mainPid.append(pid.getValue());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return mainPid.toString();
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable {
|
||||||
public void setDataInfo(DataInfo dataInfo) {
|
public void setDataInfo(DataInfo dataInfo) {
|
||||||
this.dataInfo = dataInfo;
|
this.dataInfo = dataInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toComparableString(){
|
||||||
|
return String.format("%s::%s", value != null ? value.toLowerCase() : "", qualifier != null ? qualifier.toComparableString().toLowerCase() : "");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return toComparableString().hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (obj == null)
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
StructuredProperty other = (StructuredProperty) obj;
|
||||||
|
|
||||||
|
return toComparableString()
|
||||||
|
.equals(other.toComparableString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,89 @@
|
||||||
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class MergeTest {
|
||||||
|
|
||||||
|
OafEntity oaf;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() {
|
||||||
|
oaf = new Publication();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergeListsTest() {
|
||||||
|
|
||||||
|
//string list merge test
|
||||||
|
List<String> a = Arrays.asList("a", "b", "c", "e");
|
||||||
|
List<String> b = Arrays.asList("a", "b", "c", "d");
|
||||||
|
List<String> c = null;
|
||||||
|
|
||||||
|
System.out.println("merge result 1 = " + oaf.mergeLists(a, b));
|
||||||
|
|
||||||
|
System.out.println("merge result 2 = " + oaf.mergeLists(a, c));
|
||||||
|
|
||||||
|
System.out.println("merge result 3 = " + oaf.mergeLists(c, c));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergePublicationCollectedFromTest() {
|
||||||
|
|
||||||
|
Publication a = new Publication();
|
||||||
|
Publication b = new Publication();
|
||||||
|
|
||||||
|
a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed")));
|
||||||
|
b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open")));
|
||||||
|
|
||||||
|
a.mergeFrom(b);
|
||||||
|
|
||||||
|
Assert.assertNotNull(a.getCollectedfrom());
|
||||||
|
Assert.assertEquals(3, a.getCollectedfrom().size());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergePublicationSubjectTest() {
|
||||||
|
|
||||||
|
Publication a = new Publication();
|
||||||
|
Publication b = new Publication();
|
||||||
|
|
||||||
|
a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe")));
|
||||||
|
b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe")));
|
||||||
|
|
||||||
|
a.mergeFrom(b);
|
||||||
|
|
||||||
|
Assert.assertNotNull(a.getSubject());
|
||||||
|
Assert.assertEquals(3, a.getSubject().size());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private KeyValue setKV(final String key, final String value) {
|
||||||
|
|
||||||
|
KeyValue k = new KeyValue();
|
||||||
|
|
||||||
|
k.setKey(key);
|
||||||
|
k.setValue(value);
|
||||||
|
|
||||||
|
return k;
|
||||||
|
}
|
||||||
|
|
||||||
|
private StructuredProperty setSP(final String value, final String schema, final String classname) {
|
||||||
|
StructuredProperty s = new StructuredProperty();
|
||||||
|
s.setValue(value);
|
||||||
|
Qualifier q = new Qualifier();
|
||||||
|
q.setClassname(classname);
|
||||||
|
q.setClassid(classname);
|
||||||
|
q.setSchemename(schema);
|
||||||
|
q.setSchemeid(schema);
|
||||||
|
s.setQualifier(q);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
|
@ -9,7 +9,7 @@
|
||||||
<xsl:copy-of select="//oai:header"/>
|
<xsl:copy-of select="//oai:header"/>
|
||||||
<metadata>
|
<metadata>
|
||||||
<xsl:for-each select="//*[local-name()='subject']">
|
<xsl:for-each select="//*[local-name()='subject']">
|
||||||
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</metadata>
|
</metadata>
|
||||||
<oaf:about>
|
<oaf:about>
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<RESOURCE_PROFILE>
|
<RESOURCE_PROFILE>
|
||||||
<HEADER>
|
<HEADER>
|
||||||
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
|
<RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
|
||||||
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
|
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
|
||||||
<RESOURCE_KIND value="TransformationRuleDSResources"/>
|
<RESOURCE_KIND value="TransformationRuleDSResources"/>
|
||||||
<RESOURCE_URI value=""/>
|
<RESOURCE_URI value=""/>
|
||||||
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
|
<DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
|
||||||
</HEADER>
|
</HEADER>
|
||||||
<BODY>
|
<BODY>
|
||||||
<CONFIGURATION>
|
<CONFIGURATION>
|
||||||
|
@ -24,7 +24,7 @@
|
||||||
<xsl:copy-of select="//oai:header"/>
|
<xsl:copy-of select="//oai:header"/>
|
||||||
<metadata>
|
<metadata>
|
||||||
<xsl:for-each select="//*[local-name()='subject']">
|
<xsl:for-each select="//*[local-name()='subject']">
|
||||||
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
<subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</metadata>
|
</metadata>
|
||||||
<oaf:about>
|
<oaf:about>
|
||||||
|
|
|
@ -0,0 +1,169 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.apache.commons.lang.NotImplementedException;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toMap;
|
||||||
|
|
||||||
|
public class DedupRecordFactory {
|
||||||
|
|
||||||
|
public JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){
|
||||||
|
|
||||||
|
//<id, json_entity>
|
||||||
|
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
|
||||||
|
.mapToPair((PairFunction<String,String,String>) it->
|
||||||
|
new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
|
||||||
|
);
|
||||||
|
|
||||||
|
//<source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||||
|
JavaPairRDD<String,String> mergeRels = spark
|
||||||
|
.read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
|
||||||
|
.where("relClass=='merges'")
|
||||||
|
.javaRDD()
|
||||||
|
.mapToPair(
|
||||||
|
(PairFunction<Relation, String,String>)r->
|
||||||
|
new Tuple2<String, String>(r.getTarget(), r.getSource())
|
||||||
|
);
|
||||||
|
|
||||||
|
//<dedup_id, json_entity_merged>
|
||||||
|
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||||
|
|
||||||
|
JavaPairRDD<OafKey, String> keyJson = joinResult.mapToPair((PairFunction<Tuple2<String, String>, OafKey, String>) json -> {
|
||||||
|
|
||||||
|
String idValue = json._1();
|
||||||
|
|
||||||
|
String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
|
||||||
|
|
||||||
|
//TODO remember to replace this with the actual trust retrieving
|
||||||
|
if (StringUtils.isBlank(trust)) {
|
||||||
|
Random generator = new Random();
|
||||||
|
int number = generator.nextInt(20);
|
||||||
|
double result = (number / 100.0) + 0.80;
|
||||||
|
trust = "" + result;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Tuple2<OafKey, String>(new OafKey(idValue, trust), json._2());
|
||||||
|
});
|
||||||
|
|
||||||
|
OafComparator c = new OafComparator();
|
||||||
|
//<dedup_id, mergedRecordsSortedByTrust>
|
||||||
|
JavaPairRDD<String, Iterable<String>> sortedJoinResult = keyJson.repartitionAndSortWithinPartitions(new OafPartitioner(keyJson.getNumPartitions()), c)
|
||||||
|
.mapToPair((PairFunction<Tuple2<OafKey, String>, String, String>) t -> new Tuple2<String, String>(t._1().getDedupId(), t._2()))
|
||||||
|
.groupByKey();
|
||||||
|
|
||||||
|
|
||||||
|
switch(entityType){
|
||||||
|
case Publication:
|
||||||
|
return sortedJoinResult.map(this::publicationMerger);
|
||||||
|
case Dataset:
|
||||||
|
return sortedJoinResult.map(this::datasetMerger);
|
||||||
|
case Project:
|
||||||
|
return sortedJoinResult.map(this::projectMerger);
|
||||||
|
case Software:
|
||||||
|
return sortedJoinResult.map(this::softwareMerger);
|
||||||
|
case Datasource:
|
||||||
|
return sortedJoinResult.map(this::datasourceMerger);
|
||||||
|
case Organization:
|
||||||
|
return sortedJoinResult.map(this::organizationMerger);
|
||||||
|
case OtherResearchProduct:
|
||||||
|
return sortedJoinResult.map(this::otherresearchproductMerger);
|
||||||
|
default:
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private Publication publicationMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
Publication p = new Publication(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
p.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
final Collection<List<Author>> authors = Lists.newArrayList();
|
||||||
|
final Collection<List<Instance>> instances = Lists.newArrayList();
|
||||||
|
|
||||||
|
StringBuilder trust = new StringBuilder("0.0");
|
||||||
|
|
||||||
|
e._2().forEach(pub -> {
|
||||||
|
try {
|
||||||
|
Publication publication = mapper.readValue(pub, Publication.class);
|
||||||
|
|
||||||
|
final String currentTrust = publication.getDataInfo().getTrust();
|
||||||
|
if (!currentTrust.equals("1.0")) {
|
||||||
|
trust.setLength(0);
|
||||||
|
trust.append(currentTrust);
|
||||||
|
}
|
||||||
|
|
||||||
|
p.mergeFrom(publication);
|
||||||
|
|
||||||
|
//add to the list if they are not null
|
||||||
|
if (publication.getDateofacceptance() != null)
|
||||||
|
dateofacceptance.add(publication.getDateofacceptance().getValue());
|
||||||
|
if (publication.getAuthor() != null)
|
||||||
|
authors.add(publication.getAuthor());
|
||||||
|
if (publication.getInstance() != null)
|
||||||
|
instances.add(publication.getInstance());
|
||||||
|
|
||||||
|
} catch (Exception exc){}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
p.setAuthor(null); //TODO create a single list of authors to put in the final publication
|
||||||
|
|
||||||
|
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Dataset datasetMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
throw new NotImplementedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Project projectMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
throw new NotImplementedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Software softwareMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
throw new NotImplementedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
throw new NotImplementedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Organization organizationMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
throw new NotImplementedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
private OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){
|
||||||
|
|
||||||
|
throw new NotImplementedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,15 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
import com.google.common.collect.ComparisonChain;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
public class OafComparator implements Comparator<OafKey>, Serializable {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(OafKey a, OafKey b) {
|
||||||
|
return ComparisonChain.start()
|
||||||
|
.compare(a.getDedupId(), b.getDedupId())
|
||||||
|
.compare(a.getTrust(), b.getTrust())
|
||||||
|
.result();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
/**
 * The kinds of entity a dedup record can be created for.
 */
public enum OafEntityType {
    Datasource, Organization, Project, Dataset, OtherResearchProduct, Software, Publication
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
/**
 * Sort key pairing a dedup id with a trust value, both kept as strings.
 */
public class OafKey implements Serializable {

    private String dedupId;
    private String trust;

    /** Creates an empty key; both parts are {@code null} until set. */
    public OafKey() {
    }

    /**
     * @param dedupId the dedup id part of the key
     * @param trust   the trust part of the key
     */
    public OafKey(String dedupId, String trust) {
        this.dedupId = dedupId;
        this.trust = trust;
    }

    public String getDedupId() {
        return dedupId;
    }

    public void setDedupId(String dedupId) {
        this.dedupId = dedupId;
    }

    public String getTrust() {
        return trust;
    }

    public void setTrust(String trust) {
        this.trust = trust;
    }

    /**
     * @return the key rendered as {@code dedupId->trust}
     */
    @Override
    public String toString() {
        // trust is a String: formatting it with %d throws IllegalFormatConversionException
        return String.format("%s->%s", dedupId, trust);
    }
}
|
|
@ -0,0 +1,59 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import org.apache.spark.Partitioner;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
public class OafPartitioner extends Partitioner implements Serializable {
|
||||||
|
|
||||||
|
private final int numPartitions;
|
||||||
|
|
||||||
|
public OafPartitioner(int partitions) {
|
||||||
|
assert (partitions > 0);
|
||||||
|
this.numPartitions = partitions;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numPartitions() {
|
||||||
|
return numPartitions;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getPartition(Object key) {
|
||||||
|
if (key instanceof OafKey) {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
OafKey item = (OafKey) key;
|
||||||
|
return Math.abs(item.getDedupId().hashCode() % numPartitions);
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("Unexpected Key");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
final int prime = 31;
|
||||||
|
int result = 1;
|
||||||
|
result = prime * result + numPartitions;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (obj == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!(obj instanceof OafPartitioner)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
//
|
||||||
|
OafPartitioner other = (OafPartitioner) obj;
|
||||||
|
if (numPartitions != other.numPartitions) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
//
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -37,8 +37,8 @@ public class SparkCreateConnectedComponent {
|
||||||
final String inputPath = parser.get("sourcePath");
|
final String inputPath = parser.get("sourcePath");
|
||||||
final String entity = parser.get("entity");
|
final String entity = parser.get("entity");
|
||||||
final String targetPath = parser.get("targetPath");
|
final String targetPath = parser.get("targetPath");
|
||||||
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||||
|
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||||
|
|
||||||
final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
|
final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
|
||||||
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dedup;
|
||||||
import com.google.common.collect.ComparisonChain;
|
import com.google.common.collect.ComparisonChain;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
@ -17,6 +18,7 @@ import org.apache.spark.api.java.function.PairFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.apache.spark.sql.types.StructType;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -27,26 +29,28 @@ import java.util.List;
|
||||||
public class SparkCreateDedupRecord {
|
public class SparkCreateDedupRecord {
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
|
// final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
|
||||||
parser.parseArgument(args);
|
// parser.parseArgument(args);
|
||||||
final SparkSession spark = SparkSession
|
// final SparkSession spark = SparkSession
|
||||||
.builder()
|
// .builder()
|
||||||
.appName(SparkCreateDedupRecord.class.getSimpleName())
|
// .appName(SparkCreateDedupRecord.class.getSimpleName())
|
||||||
.master(parser.get("master"))
|
// .master(parser.get("master"))
|
||||||
.getOrCreate();
|
// .getOrCreate();
|
||||||
|
//
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
final String inputPath = parser.get("sourcePath");
|
// final String inputPath = parser.get("sourcePath");
|
||||||
final String entity = parser.get("entity");
|
// final String entity = parser.get("entity");
|
||||||
final String targetPath = parser.get("targetPath");
|
// final String targetPath = parser.get("targetPath");
|
||||||
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
//// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||||
|
// final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||||
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
|
//
|
||||||
.mapToPair((PairFunction<String,String,String>)it->
|
// //<id, json_entity>
|
||||||
new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
|
// final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
|
||||||
);
|
// .mapToPair((PairFunction<String,String,String>)it->
|
||||||
|
// new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
|
||||||
|
// );
|
||||||
|
|
||||||
|
// //<source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||||
// JavaPairRDD<String,String> mergeRels = spark
|
// JavaPairRDD<String,String> mergeRels = spark
|
||||||
// .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
|
// .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
|
||||||
// .where("relClass=='merges'")
|
// .where("relClass=='merges'")
|
||||||
|
@ -56,46 +60,12 @@ public class SparkCreateDedupRecord {
|
||||||
// new Tuple2<String,String>(r.getTarget(), r.getSource())
|
// new Tuple2<String,String>(r.getTarget(), r.getSource())
|
||||||
// );
|
// );
|
||||||
//
|
//
|
||||||
//
|
// //<dedup_id, json_entity_merged>
|
||||||
// final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
// final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||||
//
|
|
||||||
// Comparator<String> c = new Comparator<String>() {
|
|
||||||
// @Override
|
|
||||||
// public int compare(String s, String t1) {
|
|
||||||
// return 0;
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
// final JavaPairRDD<String, String> stringStringJavaPairRDD = p.repartitionAndSortWithinPartitions(p.partitioner().get(), c);
|
|
||||||
|
|
||||||
|
StructType schema = Encoders.bean(Publication.class).schema();
|
||||||
|
|
||||||
// List<Foo> inputValues = Arrays.asList(
|
System.out.println(schema);
|
||||||
// new Foo("k",5),
|
|
||||||
// new Foo("a",1),
|
|
||||||
// new Foo("a",30),
|
|
||||||
// new Foo("a",18),
|
|
||||||
// new Foo("a",22),
|
|
||||||
// new Foo("b",22),
|
|
||||||
// new Foo("c",5),
|
|
||||||
// new Foo("a",5),
|
|
||||||
// new Foo("s",1),
|
|
||||||
// new Foo("h",4)
|
|
||||||
// );
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// final JavaPairRDD<Foo, Foo> fooFighters = sc.parallelize(inputValues).mapToPair((PairFunction<Foo, Foo, Foo>) i -> new Tuple2<Foo, Foo>(i, i));
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// FooComparator c = new FooComparator();
|
|
||||||
// final List<Tuple2<String, List<Foo>>> result =
|
|
||||||
// fooFighters.repartitionAndSortWithinPartitions(new FooPartitioner(fooFighters.getNumPartitions()), c)
|
|
||||||
// .mapToPair((PairFunction<Tuple2<Foo, Foo>, String, Foo>) t-> new Tuple2<String,Foo>(t._1().getValue(), t._2()) )
|
|
||||||
// .groupByKey()
|
|
||||||
// .mapValues((Function<Iterable<Foo>, List<Foo>>) Lists::newArrayList)
|
|
||||||
// .collect();
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// System.out.println(result);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,8 @@ public class SparkCreateSimRels {
|
||||||
final String inputPath = parser.get("sourcePath");
|
final String inputPath = parser.get("sourcePath");
|
||||||
final String entity = parser.get("entity");
|
final String entity = parser.get("entity");
|
||||||
final String targetPath = parser.get("targetPath");
|
final String targetPath = parser.get("targetPath");
|
||||||
final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||||
|
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||||
|
|
||||||
|
|
||||||
final long total = sc.textFile(inputPath + "/" + entity).count();
|
final long total = sc.textFile(inputPath + "/" + entity).count();
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
package eu.dnetlib.dedup.graph
|
package eu.dnetlib.dedup.graph
|
||||||
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.MapDocument
|
|
||||||
import org.apache.spark.graphx._
|
import org.apache.spark.graphx._
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
|
|
||||||
|
@ -25,7 +23,7 @@ object GraphProcessor {
|
||||||
}
|
}
|
||||||
val connectedComponents = joinResult.groupByKey()
|
val connectedComponents = joinResult.groupByKey()
|
||||||
.map[ConnectedComponent](cc => asConnectedComponent(cc))
|
.map[ConnectedComponent](cc => asConnectedComponent(cc))
|
||||||
(connectedComponents)
|
connectedComponents
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -1,5 +1,7 @@
|
||||||
package eu.dnetlib.dedup;
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
@ -8,36 +10,37 @@ import org.junit.Test;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
public class SparkCreateDedupTest {
|
public class SparkCreateDedupTest {
|
||||||
|
|
||||||
|
String configuration;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws IOException {
|
public void setUp() throws IOException {
|
||||||
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_vertex"));
|
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json"));
|
||||||
FileUtils.deleteDirectory(new File("/tmp/pub_dedup_rels"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Ignore
|
||||||
public void dedupTest() throws Exception {
|
public void createSimRelsTest() throws Exception {
|
||||||
final String configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
|
|
||||||
|
|
||||||
|
|
||||||
SparkCreateSimRels.main(new String[] {
|
SparkCreateSimRels.main(new String[] {
|
||||||
"-mt", "local[*]",
|
"-mt", "local[*]",
|
||||||
"-s", "/home/sandro/betadump",
|
"-s", "/Users/miconis/dumps",
|
||||||
"-e", "publication",
|
"-e", "publication",
|
||||||
"-c", configuration,
|
"-c", configuration,
|
||||||
"-t", "/tmp/dedup",
|
"-t", "/tmp/dedup",
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Ignore
|
||||||
|
public void createCCTest() throws Exception {
|
||||||
|
|
||||||
SparkCreateConnectedComponent.main(new String[] {
|
SparkCreateConnectedComponent.main(new String[] {
|
||||||
"-mt", "local[*]",
|
"-mt", "local[*]",
|
||||||
"-s", "/home/sandro/betadump",
|
"-s", "/Users/miconis/dumps",
|
||||||
"-e", "publication",
|
"-e", "publication",
|
||||||
"-c", configuration,
|
"-c", configuration,
|
||||||
"-t", "/tmp/dedup",
|
"-t", "/tmp/dedup",
|
||||||
|
@ -49,14 +52,10 @@ public class SparkCreateDedupTest {
|
||||||
public void dedupRecordTest() throws Exception {
|
public void dedupRecordTest() throws Exception {
|
||||||
SparkCreateDedupRecord.main(new String[] {
|
SparkCreateDedupRecord.main(new String[] {
|
||||||
"-mt", "local[*]",
|
"-mt", "local[*]",
|
||||||
"-s", "/home/sandro/betadump",
|
"-s", "/Users/miconis/dumps",
|
||||||
"-e", "publication",
|
"-e", "publication",
|
||||||
"-c", "configuration",
|
"-c", configuration,
|
||||||
"-t", "/tmp/dedup",
|
"-t", "/tmp/dedup",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
"queueMaxSize": "2000",
|
"queueMaxSize": "2000",
|
||||||
"groupMaxSize": "50",
|
"groupMaxSize": "50",
|
||||||
"slidingWindowSize": "200",
|
"slidingWindowSize": "200",
|
||||||
"idPath": "$.id",
|
"idPath": ".id",
|
||||||
"rootBuilder": [
|
"rootBuilder": [
|
||||||
"organization",
|
"organization",
|
||||||
"projectOrganization_participation_isParticipant",
|
"projectOrganization_participation_isParticipant",
|
||||||
|
@ -117,14 +117,6 @@
|
||||||
"host": 0.5,
|
"host": 0.5,
|
||||||
"path": 0.5
|
"path": 0.5
|
||||||
}
|
}
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "gridid",
|
|
||||||
"algo": "Null",
|
|
||||||
"type": "String",
|
|
||||||
"weight": "0.0",
|
|
||||||
"ignoreMissing": "true",
|
|
||||||
"path": ".pid[] | select(.qualifier.classid==\"grid\") | .value"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"blacklists": {
|
"blacklists": {
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
<configuration>
|
<configuration>
|
||||||
<property>
|
<property>
|
||||||
<name>jobTracker</name>
|
<name>jobTracker</name>
|
||||||
<value>yarnRM</value>
|
<dedupId>yarnRM</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>nameNode</name>
|
<name>nameNode</name>
|
||||||
<value>hdfs://nameservice1</value>
|
<dedupId>hdfs://nameservice1</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sourceNN</name>
|
<name>sourceNN</name>
|
||||||
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
|
<dedupId>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozie.use.system.libpath</name>
|
<name>oozie.use.system.libpath</name>
|
||||||
<value>true</value>
|
<dedupId>true</dedupId>
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -14,12 +14,12 @@
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hbase_dump_distcp_memory_mb</name>
|
<name>hbase_dump_distcp_memory_mb</name>
|
||||||
<value>6144</value>
|
<dedupId>6144</dedupId>
|
||||||
<description>memory for distcp action copying InfoSpace dump from remote cluster</description>
|
<description>memory for distcp action copying InfoSpace dump from remote cluster</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hbase_dump_distcp_num_maps</name>
|
<name>hbase_dump_distcp_num_maps</name>
|
||||||
<value>1</value>
|
<dedupId>1</dedupId>
|
||||||
<description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
|
<description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
|
@ -1,26 +1,26 @@
|
||||||
<configuration>
|
<configuration>
|
||||||
<property>
|
<property>
|
||||||
<name>jobTracker</name>
|
<name>jobTracker</name>
|
||||||
<value>yarnRM</value>
|
<dedupId>yarnRM</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>nameNode</name>
|
<name>nameNode</name>
|
||||||
<value>hdfs://nameservice1</value>
|
<dedupId>hdfs://nameservice1</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozie.use.system.libpath</name>
|
<name>oozie.use.system.libpath</name>
|
||||||
<value>true</value>
|
<dedupId>true</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
<value>spark2</value>
|
<dedupId>spark2</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hive_metastore_uris</name>
|
<name>hive_metastore_uris</name>
|
||||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
<dedupId>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</dedupId>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hive_db_name</name>
|
<name>hive_db_name</name>
|
||||||
<value>openaire</value>
|
<dedupId>openaire</dedupId>
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -54,7 +54,7 @@ Properties overriding order is the following:
|
||||||
2. `~/.dhp/application.properties` defined properties
|
2. `~/.dhp/application.properties` defined properties
|
||||||
3. `${workflow.source.dir}/job.properties`
|
3. `${workflow.source.dir}/job.properties`
|
||||||
4. `job-override.properties` (located in the project root dir)
|
4. `job-override.properties` (located in the project root dir)
|
||||||
5. `maven -Dparam=value`
|
5. `maven -Dparam=value`
|
||||||
|
|
||||||
where the maven `-Dparam` property is overriding all the other ones.
|
where the maven `-Dparam` property is overriding all the other ones.
|
||||||
|
|
||||||
|
@ -73,7 +73,7 @@ Workflow definition requirements
|
||||||
|
|
||||||
This property can be set using maven `-D` switch.
|
This property can be set using maven `-D` switch.
|
||||||
|
|
||||||
`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value.
|
`[oozie_app]` is the default directory name; however, it can be set to any value as long as the `oozieAppDir` property is provided with the directory name as its value.
|
||||||
|
|
||||||
Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory.
|
Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory.
|
||||||
|
|
||||||
|
|
|
@ -73,7 +73,7 @@
|
||||||
<!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
|
<!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
|
||||||
<id>attach-test-resources</id>
|
<id>attach-test-resources</id>
|
||||||
<properties>
|
<properties>
|
||||||
<!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package requires all test dependencies. -->
|
<!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package require all test dependencies. -->
|
||||||
<oozie.package.dependencies.include.scope />
|
<oozie.package.dependencies.include.scope />
|
||||||
<oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
|
<oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
|
||||||
<!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
|
<!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
|
||||||
|
@ -326,7 +326,7 @@
|
||||||
</goals>
|
</goals>
|
||||||
<configuration>
|
<configuration>
|
||||||
<tasks>
|
<tasks>
|
||||||
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
|
<property name="assembly-resources.loc" dedupId="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
|
||||||
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
|
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
|
||||||
</tasks>
|
</tasks>
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|
Loading…
Reference in New Issue