forked from D-Net/dnet-hadoop

commit abd9034da0 (parent 4b66b471a4)

    implemented DedupRecord factory with the merge of publications
@@ -18,4 +18,5 @@
 /*/build
 /build
 spark-warehouse
-/dhp-workflows/dhp-graph-mapper/job-override.properties
+/*/*/job-override.properties
+
@@ -17,6 +17,10 @@
             <groupId>commons-cli</groupId>
             <artifactId>commons-cli</artifactId>
         </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+        </dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>
             <artifactId>commons-lang3</artifactId>
@@ -29,21 +33,15 @@
             <groupId>javax.persistence</groupId>
             <artifactId>javax.persistence-api</artifactId>
         </dependency>

         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
         </dependency>

         <!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
         <dependency>
             <groupId>com.rabbitmq</groupId>
             <artifactId>amqp-client</artifactId>
         </dependency>
-        <dependency>
-            <groupId>commons-io</groupId>
-            <artifactId>commons-io</artifactId>
-        </dependency>
     </dependencies>

 </project>
@@ -2,17 +2,25 @@ package eu.dnetlib.dhp.application;

 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.cli.*;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.IOUtils;

+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.Serializable;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
+import java.io.StringWriter;
+import java.util.*;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import java.util.zip.Inflater;

 public class ArgumentApplicationParser implements Serializable {

     private final Options options = new Options();
     private final Map<String, String> objectMap = new HashMap<>();

+    private final List<String> compressedValues = new ArrayList<>();

     public ArgumentApplicationParser(final String json_configuration) throws Exception {
         final ObjectMapper mapper = new ObjectMapper();
         final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
@@ -29,6 +37,9 @@ public class ArgumentApplicationParser implements Serializable {
             final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
             o.setLongOpt(conf.getParamLongName());
             o.setRequired(conf.isParamRequired());
+            if (conf.isCompressed()) {
+                compressedValues.add(conf.getParamLongName());
+            }
             return o;
         }).forEach(options::addOption);

@@ -38,10 +49,32 @@ public class ArgumentApplicationParser implements Serializable {

     }

+    public static String decompressValue(final String abstractCompressed) {
+        try {
+            byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
+            GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
+            final StringWriter stringWriter = new StringWriter();
+            IOUtils.copy(gis, stringWriter);
+            return stringWriter.toString();
+        } catch (Throwable e) {
+            System.out.println("Wrong value to decompress:" + abstractCompressed);
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static String compressArgument(final String value) throws Exception{
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        GZIPOutputStream gzip = new GZIPOutputStream(out);
+        gzip.write(value.getBytes());
+        gzip.close();
+        return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
+    }
+
     public void parseArgument(final String[] args) throws Exception {
         CommandLineParser parser = new BasicParser();
         CommandLine cmd = parser.parse(options, args);
-        Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
+        Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), compressedValues.contains(it.getLongOpt())? decompressValue(it.getValue()): it.getValue()));
     }

     public String get(final String key) {
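Note (not part of the commit): a minimal sketch of how the new compressArgument/decompressValue helpers are meant to round-trip an oversized command-line value; the JSON string below is only an illustrative payload.

    import eu.dnetlib.dhp.application.ArgumentApplicationParser;

    public class CompressedArgumentRoundTrip {
        public static void main(String[] args) throws Exception {
            // any large string works; a dedup configuration is the intended payload
            final String original = "{\"wf\":{\"entityType\":\"publication\"}}";
            // gzip + Base64 so the value survives being passed as a single CLI argument
            final String packed = ArgumentApplicationParser.compressArgument(original);
            // the parser calls decompressValue() itself for options declared with "compressed": true
            final String unpacked = ArgumentApplicationParser.decompressValue(packed);
            System.out.println(original.equals(unpacked)); // expected: true
        }
    }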
@@ -7,6 +7,7 @@ public class OptionsParameter {
     private String paramLongName;
     private String paramDescription;
     private boolean paramRequired;
+    private boolean compressed;

     public OptionsParameter() {
     }
@@ -26,4 +27,12 @@ public class OptionsParameter {
     public boolean isParamRequired() {
         return paramRequired;
     }
+
+    public boolean isCompressed() {
+        return compressed;
+    }
+
+    public void setCompressed(boolean compressed) {
+        this.compressed = compressed;
+    }
 }
@@ -3,6 +3,10 @@ package eu.dnetlib.dhp.application;
 import org.apache.commons.io.IOUtils;
 import org.junit.Test;

+import java.io.ByteArrayOutputStream;
+import java.util.Base64;
+import java.util.zip.GZIPOutputStream;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;

@@ -24,6 +28,7 @@ public class ArgumentApplicationParserTest {
                 "-ro", "value7",
                 "-rr", "value8",
                 "-w", "value9",
+                "-cc", ArgumentApplicationParser.compressArgument(jsonConfiguration)
         });
         assertNotNull(parser.get("hdfsPath"));
         assertNotNull(parser.get("apidescriptor"));
@@ -45,7 +50,12 @@ public class ArgumentApplicationParserTest {
         assertEquals("value7", parser.get("rabbitOngoingQueue"));
         assertEquals("value8", parser.get("rabbitReportQueue"));
         assertEquals("value9", parser.get("workflowId"));
+        assertEquals(jsonConfiguration, parser.get("ccCoco"));
     }

+
+
+
+
 }
@@ -1,12 +1,13 @@
 [
 {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
 {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
 {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
 {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true},
 {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true},
 {"paramName":"rp", "paramLongName":"rabbitPassWord", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true},
 {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true},
 {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true},
 {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true},
-{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true}
+{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true},
+{"paramName":"cc", "paramLongName":"ccCoco", "paramDescription": "the identifier of the dnet Workflow", "compressed":true,"paramRequired": true}
 ]
@@ -84,4 +84,31 @@ public class Instance implements Serializable {
     public void setDateofacceptance(Field<String> dateofacceptance) {
         this.dateofacceptance = dateofacceptance;
     }
+    public String toComparableString(){
+        return String.format("%s::%s::%s::%s",
+                hostedby != null && hostedby.getKey()!= null ? hostedby.getKey().toLowerCase() : "",
+                accessright!= null && accessright.getClassid()!= null ? accessright.getClassid() : "",
+                instancetype!= null && instancetype.getClassid()!= null ? instancetype.getClassid() : "",
+                url != null ? url:"");
+    }
+
+    @Override
+    public int hashCode() {
+        return toComparableString().hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+
+        Instance other = (Instance) obj;
+
+        return toComparableString()
+                .equals(other.toComparableString());
+    }
 }
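Note (not part of the commit): a small sketch of what the new equals/hashCode buy, namely that instances with the same comparable string collapse in hash-based collections, which is what lets merged records deduplicate their instance lists; it assumes Instance keeps a public no-argument bean constructor.

    import eu.dnetlib.dhp.schema.oaf.Instance;

    import java.util.HashSet;
    import java.util.Set;

    public class InstanceEqualitySketch {
        public static void main(String[] args) {
            // two empty instances produce the same toComparableString(), so they are now equal
            Instance a = new Instance();
            Instance b = new Instance();

            Set<Instance> merged = new HashSet<>();
            merged.add(a);
            merged.add(b);

            System.out.println(a.equals(b));   // expected: true
            System.out.println(merged.size()); // expected: 1
        }
    }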
@@ -6,6 +6,8 @@ import java.io.Serializable;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;

 public abstract class Result extends OafEntity implements Serializable {

@@ -251,13 +253,12 @@ public abstract class Result extends OafEntity implements Serializable {

         Result r = (Result) e;

-        mergeAuthors(r.getAuthor());

         //TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function)
-//        if (author == null)
-//            author = r.getAuthor(); //authors will be replaced because they could be too much
 //        dateofacceptance = r.getDateofacceptance();
-//        instance = mergeLists(instance, r.getInstance());
+        instance = mergeLists(instance, r.getInstance());

         if (r.getResulttype() != null)
             resulttype = r.getResulttype();
@@ -309,80 +310,5 @@ public abstract class Result extends OafEntity implements Serializable {

     }

-    public void mergeAuthors(List<Author> authors){
-        int c1 = countAuthorsPids(author);
-        int c2 = countAuthorsPids(authors);
-        int s1 = authorsSize(author);
-        int s2 = authorsSize(authors);
-
-
-        //if both have no authors with pids and authors is bigger than author
-        if (c1 == 0 && c2 == 0 && author.size()<authors.size()) {
-            author = authors;
-            return;
-        }
-
-        //author is null and authors have 0 or more authors with pids
-        if (c1<c2 && c1<0) {
-            author = authors;
-            return;
-        }
-
-        //andiamo a mangiare
-
-
-//        if (author == null && authors == null)
-//            return;
-//
-//        int c1 = countAuthorsPids(author);
-//        int c2 = countAuthorsPids(authors);
-//
-//        if (c1<c2 && c1<1){
-//            author = authors;
-//            return;
-//        }
-//
-//        if (c1<c2)
-
-
-
-
-
-
-    }
-
-    public int countAuthorsPids(List<Author> authors){
-        if (authors == null)
-            return -1;
-
-        return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
-    }
-
-    public int authorsSize(List<Author> authors){
-        if (authors == null)
-            return 0;
-        return authors.size();
-    }
-
-    public String extractAuthorPid(Author a){
-
-        if(a == null || a.getPid() == null || a.getPid().size() == 0)
-            return null;
-
-        StringBuilder mainPid = new StringBuilder();
-
-        a.getPid().forEach(pid ->{
-            if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
-                mainPid.setLength(0);
-                mainPid.append(pid.getValue());
-            }
-            else {
-                if(mainPid.length() == 0)
-                    mainPid.append(pid.getValue());
-            }
-        });
-
-        return mainPid.toString();
-
-    }
 }
@@ -9,7 +9,7 @@
                 <xsl:copy-of select="//oai:header"/>
                 <metadata>
                     <xsl:for-each select="//*[local-name()='subject']">
-                        <subject><xsl:dedupId-of select="eg:clean(.,'dnet:languages')"/></subject>
+                        <subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
                     </xsl:for-each>
                 </metadata>
                 <oaf:about>
@@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <RESOURCE_PROFILE>
     <HEADER>
-        <RESOURCE_IDENTIFIER dedupId="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
-        <RESOURCE_TYPE dedupId="TransformationRuleDSResourceType"/>
-        <RESOURCE_KIND dedupId="TransformationRuleDSResources"/>
-        <RESOURCE_URI dedupId=""/>
-        <DATE_OF_CREATION dedupId="2019-04-11T11:15:30+00:00"/>
+        <RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
+        <RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
+        <RESOURCE_KIND value="TransformationRuleDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
     </HEADER>
     <BODY>
         <CONFIGURATION>
@@ -24,7 +24,7 @@
                 <xsl:copy-of select="//oai:header"/>
                 <metadata>
                     <xsl:for-each select="//*[local-name()='subject']">
-                        <subject><xsl:dedupId-of select="eg:clean(.,'dnet:languages')"/></subject>
+                        <subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
                     </xsl:for-each>
                 </metadata>
                 <oaf:about>
@@ -0,0 +1,119 @@
+package eu.dnetlib.dedup;
+
+import eu.dnetlib.dhp.schema.oaf.Field;
+import org.apache.commons.lang.StringUtils;
+
+import java.time.Year;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static java.util.Collections.reverseOrder;
+import static java.util.Map.Entry.comparingByValue;
+import static java.util.stream.Collectors.toMap;
+import static org.apache.commons.lang.StringUtils.endsWith;
+import static org.apache.commons.lang.StringUtils.substringBefore;
+
+public class DatePicker {
+
+    private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
+    private static final String DATE_DEFAULT_SUFFIX = "01-01";
+    private static final int YEAR_LB = 1300;
+    private static final int YEAR_UB = Year.now().getValue() + 5;
+
+    public static Field<String> pick(final Collection<String> dateofacceptance) {
+
+        final Map<String, Integer> frequencies = dateofacceptance
+                .parallelStream()
+                .filter(StringUtils::isNotBlank)
+                .collect(
+                        Collectors.toConcurrentMap(
+                                w -> w, w -> 1, Integer::sum));
+
+        if (frequencies.isEmpty()) {
+            return new Field<>();
+        }
+
+        final Field<String> date = new Field<>();
+        date.setValue(frequencies.keySet().iterator().next());
+
+        // let's sort this map by values first, filtering out invalid dates
+        final Map<String, Integer> sorted = frequencies
+                .entrySet()
+                .stream()
+                .filter(d -> StringUtils.isNotBlank(d.getKey()))
+                .filter(d -> d.getKey().matches(DATE_PATTERN))
+                .filter(d -> inRange(d.getKey()))
+                .sorted(reverseOrder(comparingByValue()))
+                .collect(
+                        toMap(
+                                Map.Entry::getKey,
+                                Map.Entry::getValue, (e1, e2) -> e2,
+                                LinkedHashMap::new));
+
+        // shortcut
+        if (sorted.size() == 0) {
+            return date;
+        }
+
+        // voting method (1/3 + 1) wins
+        if (sorted.size() >= 3) {
+            final int acceptThreshold = (sorted.size() / 3) + 1;
+            final List<String> accepted = sorted.entrySet().stream()
+                    .filter(e -> e.getValue() >= acceptThreshold)
+                    .map(e -> e.getKey())
+                    .collect(Collectors.toList());
+
+            // cannot find strong majority
+            if (accepted.isEmpty()) {
+                final int max = sorted.values().iterator().next();
+                Optional<String> first = sorted.entrySet().stream()
+                        .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
+                        .map(Map.Entry::getKey)
+                        .findFirst();
+                if (first.isPresent()) {
+                    date.setValue(first.get());
+                    return date;
+                }
+
+                date.setValue(sorted.keySet().iterator().next());
+                return date;
+            }
+
+            if (accepted.size() == 1) {
+                date.setValue(accepted.get(0));
+                return date;
+            } else {
+                final Optional<String> first = accepted.stream()
+                        .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
+                        .findFirst();
+                if (first.isPresent()) {
+                    date.setValue(first.get());
+                    return date;
+                }
+
+                return date;
+            }
+
+            //1st non YYYY-01-01 is returned
+        } else {
+            if (sorted.size() == 2) {
+                for (Map.Entry<String, Integer> e : sorted.entrySet()) {
+                    if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
+                        date.setValue(e.getKey());
+                        return date;
+                    }
+                }
+            }
+
+            // none of the dates seems good enough, return the 1st one
+            date.setValue(sorted.keySet().iterator().next());
+            return date;
+        }
+    }
+
+    private static boolean inRange(final String date) {
+        final int year = Integer.parseInt(substringBefore(date, "-"));
+        return year >= YEAR_LB && year <= YEAR_UB;
+    }
+
+}
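Note (not part of the commit): a quick illustration of the picking rules above — invalid strings are filtered by the date pattern, and among the remaining candidates the most frequent date that does not end with the default 01-01 suffix wins.

    import eu.dnetlib.dedup.DatePicker;
    import eu.dnetlib.dhp.schema.oaf.Field;

    import java.util.Arrays;

    public class DatePickerExample {
        public static void main(String[] args) {
            // "not-a-date" is dropped by the DATE_PATTERN filter
            Field<String> picked = DatePicker.pick(Arrays.asList(
                    "2018-01-01", "2018-06-15", "2018-06-15", "not-a-date"));

            System.out.println(picked.getValue()); // expected: 2018-06-15
        }
    }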
@@ -1,6 +1,5 @@
 package eu.dnetlib.dedup;

-import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.pace.config.DedupConfig;
@@ -10,23 +9,20 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.codehaus.jackson.map.ObjectMapper;
 import scala.Tuple2;

-import java.io.IOException;
 import java.util.Collection;
-import java.util.List;
 import java.util.Random;

 import static java.util.stream.Collectors.toMap;

 public class DedupRecordFactory {

-    public JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){
+    public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){

         //<id, json_entity>
         final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
@@ -51,7 +47,12 @@ public class DedupRecordFactory {

                     String idValue = json._1();

-                    String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
+                    String trust ="";
+                    try {
+                        trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
+                    } catch (Throwable e) {
+
+                    }

                     //TODO remember to replace this with the actual trust retrieving
                     if (StringUtils.isBlank(trust)) {
@@ -71,28 +72,32 @@ public class DedupRecordFactory {
                 .groupByKey();

+
+
+
+
         switch(entityType){
-            case Publication:
-                return sortedJoinResult.map(this::publicationMerger);
-            case Dataset:
-                return sortedJoinResult.map(this::datasetMerger);
-            case Project:
-                return sortedJoinResult.map(this::projectMerger);
-            case Software:
-                return sortedJoinResult.map(this::softwareMerger);
-            case Datasource:
-                return sortedJoinResult.map(this::datasourceMerger);
-            case Organization:
-                return sortedJoinResult.map(this::organizationMerger);
-            case OtherResearchProduct:
-                return sortedJoinResult.map(this::otherresearchproductMerger);
+            case publication:
+                return sortedJoinResult.map(DedupRecordFactory::publicationMerger);
+            case dataset:
+                return sortedJoinResult.map(DedupRecordFactory::datasetMerger);
+            case project:
+                return sortedJoinResult.map(DedupRecordFactory::projectMerger);
+            case software:
+                return sortedJoinResult.map(DedupRecordFactory::softwareMerger);
+            case datasource:
+                return sortedJoinResult.map(DedupRecordFactory::datasourceMerger);
+            case organization:
+                return sortedJoinResult.map(DedupRecordFactory::organizationMerger);
+            case otherresearchproduct:
+                return sortedJoinResult.map(DedupRecordFactory::otherresearchproductMerger);
             default:
                 return null;
         }

     }

-    private Publication publicationMerger(Tuple2<String, Iterable<String>> e){
+    private static Publication publicationMerger(Tuple2<String, Iterable<String>> e){

         Publication p = new Publication(); //the result of the merge, to be returned at the end

@@ -101,67 +106,59 @@ public class DedupRecordFactory {
         final ObjectMapper mapper = new ObjectMapper();

         final Collection<String> dateofacceptance = Lists.newArrayList();
-        final Collection<List<Author>> authors = Lists.newArrayList();
-        final Collection<List<Instance>> instances = Lists.newArrayList();

         StringBuilder trust = new StringBuilder("0.0");

+        if (e._2() != null)
         e._2().forEach(pub -> {
             try {
                 Publication publication = mapper.readValue(pub, Publication.class);

                 final String currentTrust = publication.getDataInfo().getTrust();
-                if (!currentTrust.equals("1.0")) {
+                if (!"1.0".equals(currentTrust)) {
                     trust.setLength(0);
                     trust.append(currentTrust);
                 }

                 p.mergeFrom(publication);
+                p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
                 //add to the list if they are not null
                 if (publication.getDateofacceptance() != null)
                     dateofacceptance.add(publication.getDateofacceptance().getValue());
-                if (publication.getAuthor() != null)
-                    authors.add(publication.getAuthor());
-                if (publication.getInstance() != null)
-                    instances.add(publication.getInstance());
-
-            } catch (Exception exc){}
-
+            } catch (Exception exc){
+                throw new RuntimeException(exc);
+            }
         });
-        p.setAuthor(null); //TODO create a single list of authors to put in the final publication
+        p.setDateofacceptance(DatePicker.pick(dateofacceptance));

         return p;
     }

-    private Dataset datasetMerger(Tuple2<String, Iterable<String>> e){
+    private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e){

         throw new NotImplementedException();
     }

-    private Project projectMerger(Tuple2<String, Iterable<String>> e){
+    private static Project projectMerger(Tuple2<String, Iterable<String>> e){

         throw new NotImplementedException();
     }

-    private Software softwareMerger(Tuple2<String, Iterable<String>> e){
+    private static Software softwareMerger(Tuple2<String, Iterable<String>> e){

         throw new NotImplementedException();
     }

-    private Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){
+    private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){

         throw new NotImplementedException();
     }

-    private Organization organizationMerger(Tuple2<String, Iterable<String>> e){
+    private static Organization organizationMerger(Tuple2<String, Iterable<String>> e){

         throw new NotImplementedException();
     }

-    private OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){
+    private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){

         throw new NotImplementedException();
     }
@@ -1,11 +1,17 @@
 package eu.dnetlib.dedup;

 import com.google.common.collect.Sets;
+import com.wcohen.ss.JaroWinkler;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;

 import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.Person;
 import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -14,32 +20,35 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.util.LongAccumulator;
+import scala.Tuple2;

 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;

 public class DedupUtility {
+    private static final Double THRESHOLD = 0.95;

     public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {

         Map<String, LongAccumulator> accumulators = new HashMap<>();

-        String acc1 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1");
+        String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
         accumulators.put(acc1, context.longAccumulator(acc1));
-        String acc2 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
+        String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
         accumulators.put(acc2, context.longAccumulator(acc2));
-        String acc3 = String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
+        String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
         accumulators.put(acc3, context.longAccumulator(acc3));
-        String acc4 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list");
+        String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
         accumulators.put(acc4, context.longAccumulator(acc4));
-        String acc5 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
+        String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
         accumulators.put(acc5, context.longAccumulator(acc5));
-        String acc6 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
+        String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
         accumulators.put(acc6, context.longAccumulator(acc6));

         return accumulators;
@@ -52,7 +61,7 @@ public class DedupUtility {
     public static void deleteIfExists(String path) throws IOException {
         Configuration conf = new Configuration();
         FileSystem fileSystem = FileSystem.get(conf);
-        if (fileSystem.exists(new Path(path))){
+        if (fileSystem.exists(new Path(path))) {
             fileSystem.delete(new Path(path), true);
         }
     }
@@ -91,4 +100,171 @@ public class DedupUtility {
             return null;
         }
     }
+
+
+    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+        int pa = countAuthorsPids(a);
+        int pb = countAuthorsPids(b);
+        List<Author> base, enrich;
+        int sa = authorsSize(a);
+        int sb = authorsSize(b);
+
+        if(pa == pb){
+            base = sa>sb?a:b;
+            enrich = sa>sb?b:a;
+        } else {
+            base = pa>pb?a:b;
+            enrich = pa>pb?b:a;
+        }
+        enrichPidFromList(base, enrich);
+        return base;
+
+
+
+//        //if both have no authors with pids
+//        if (pa < 1 && pb < 1) {
+//            //B is bigger than A
+//            if (sa < sb)
+//                return b;
+//            //A is bigger than B
+//            else
+//                return a;
+//        }
+//        //If A has author with pids
+//        if (pa > 0) {
+//            //B has no author with pid
+//            if (pb < 1)
+//                return a;
+//            //B has author with pid
+//            else {
+//                enrichPidFromList(a, b);
+//                return a;
+//            }
+//        }
+//        //If B has author with pids
+//        //A has no author with pid
+//        if (pa < 1)
+//            return b;
+//        //A has author with pid
+//        else {
+//            enrichPidFromList(b, a);
+//            return b;
+//        }
+    }
+
+    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+        if(base==null || enrich == null)
+            return;
+        final Map<String, Author> basePidAuthorMap = base.stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(a -> a.getPid()
+                        .stream()
+                        .map(p -> new Tuple2<>(p.toComparableString(), a))
+                ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
+
+        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+                .stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
+                .collect(Collectors.toList());
+
+
+        pidToEnrich.forEach(a -> {
+            Optional<Tuple2<Double, Author>> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
+            if (simAuhtor.isPresent() && simAuhtor.get()._1()> THRESHOLD) {
+                Author r = simAuhtor.get()._2();
+                r.getPid().add(a._1());
+            }
+        });
+    }
+
+    public static String createEntityPath(final String basePath, final String entityType) {
+        return String.format("%s/%s", basePath,entityType);
+    }
+
+    public static String createSimRelPath(final String basePath, final String entityType) {
+        return String.format("%s/%s_simRel", basePath,entityType);
+    }
+
+    public static String createMergeRelPath(final String basePath, final String entityType) {
+        return String.format("%s/%s_mergeRel", basePath,entityType);
+    }
+
+    private static Double sim(Author a, Author b) {
+
+        final Person pa = parse(a);
+        final Person pb = parse(b);
+
+        if (pa.isAccurate() & pb.isAccurate()) {
+            return new JaroWinkler().score(
+                    normalize(pa.getSurnameString()),
+                    normalize(pb.getSurnameString()));
+        } else {
+            return new JaroWinkler().score(
+                    normalize(pa.getNormalisedFullname()),
+                    normalize(pb.getNormalisedFullname()));
+        }
+    }
+
+    private static String normalize(final String s) {
+        return nfd(s).toLowerCase()
+                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+                .replaceAll("(\\W)+", " ")
+                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+                .replaceAll("(\\p{Punct})+", " ")
+                .replaceAll("(\\d)+", " ")
+                .replaceAll("(\\n)+", " ")
+                .trim();
+    }
+
+    private static String nfd(final String s) {
+        return Normalizer.normalize(s, Normalizer.Form.NFD);
+    }
+    private static Person parse(Author author) {
+        if (StringUtils.isNotBlank(author.getSurname())) {
+            return new Person(author.getSurname() + ", " + author.getName(), false);
+        } else {
+            return new Person(author.getFullname(), false);
+        }
+    }
+
+
+    private static int countAuthorsPids(List<Author> authors) {
+        if (authors == null)
+            return 0;
+
+        return (int) authors.stream().map(DedupUtility::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
+    }
+
+    private static int authorsSize(List<Author> authors) {
+        if (authors == null)
+            return 0;
+        return authors.size();
+    }
+
+
+    private static boolean isAccurate(final Author a) {
+        return StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname());
+    }
+
+    private static String extractAuthorPid(Author a) {
+
+        if (a == null || a.getPid() == null || a.getPid().size() == 0)
+            return null;
+
+        StringBuilder mainPid = new StringBuilder();
+
+        a.getPid().forEach(pid -> {
+            if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
+                mainPid.setLength(0);
+                mainPid.append(pid.getValue());
+            } else {
+                if (mainPid.length() == 0)
+                    mainPid.append(pid.getValue());
+            }
+        });
+
+        return mainPid.toString();
+
+    }
 }
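Note (not part of the commit): the three path helpers above are what the Spark jobs below switch to, so every stage derives the same on-disk layout from a base path plus an entity name; the base paths in this sketch are made up.

    import eu.dnetlib.dedup.DedupUtility;

    public class DedupPathsSketch {
        public static void main(String[] args) {
            // hypothetical base paths, shown only to illustrate the derived layout
            System.out.println(DedupUtility.createEntityPath("/data/openaire", "publication"));
            // -> /data/openaire/publication
            System.out.println(DedupUtility.createSimRelPath("/tmp/dedup", "publication"));
            // -> /tmp/dedup/publication_simRel
            System.out.println(DedupUtility.createMergeRelPath("/tmp/dedup", "publication"));
            // -> /tmp/dedup/publication_mergeRel
        }
    }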
@@ -2,12 +2,14 @@ package eu.dnetlib.dedup;

 public enum OafEntityType {

-    Datasource,
-    Organization,
-    Project,
-    Dataset,
-    OtherResearchProduct,
-    Software,
-    Publication
+    datasource,
+    organization,
+    project,
+    dataset,
+    otherresearchproduct,
+    software,
+    publication
+
+
 }
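Note (not part of the commit): the constants were lowercased because the entity name reaches the job as a lowercase string (e.g. "-e publication") and Enum.valueOf is case-sensitive; with the old capitalized constants the lookup below would throw IllegalArgumentException.

    import eu.dnetlib.dedup.OafEntityType;

    public class EntityTypeLookup {
        public static void main(String[] args) {
            // the value normally comes from parser.get("entity")
            String entity = "publication";
            OafEntityType type = OafEntityType.valueOf(entity);
            System.out.println(type); // publication
        }
    }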
@@ -46,7 +46,7 @@ public class SparkCreateConnectedComponent {
                 s -> new Tuple2<Object, String>((long) s.hashCode(), s)
         );

-        final Dataset<Relation> similarityRelations = spark.read().load(targetPath + "/" + entity+"_simrel").as(Encoders.bean(Relation.class));
+        final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class));


         final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(it.getSource().hashCode(), it.getTarget().hashCode(), it.getRelClass())).rdd();
@@ -73,7 +73,7 @@ public class SparkCreateConnectedComponent {
             return tmp.stream();
         }).iterator()).rdd(), Encoders.bean(Relation.class));

-        mergeRelation.write().mode("overwrite").save(targetPath+"/"+entity+"_mergeRels");
+        mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity));


     }
@@ -1,71 +1,39 @@
 package eu.dnetlib.dedup;

-import com.google.common.collect.ComparisonChain;
-import com.google.common.collect.Lists;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import org.apache.commons.io.IOUtils;
-import org.apache.spark.Partitioner;
-import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.types.StructType;
-import scala.Tuple2;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.List;

 public class SparkCreateDedupRecord {

     public static void main(String[] args) throws Exception {
-//        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
-//        parser.parseArgument(args);
-//        final SparkSession spark = SparkSession
-//                .builder()
-//                .appName(SparkCreateDedupRecord.class.getSimpleName())
-//                .master(parser.get("master"))
-//                .getOrCreate();
-//
-//        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-//        final String inputPath = parser.get("sourcePath");
-//        final String entity = parser.get("entity");
-//        final String targetPath = parser.get("targetPath");
-////        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
-//        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
-//
-//        //<id, json_entity>
-//        final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
-//                .mapToPair((PairFunction<String,String,String>)it->
-//                        new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
-//                );
-
-//        //<source, target>: source is the dedup_id, target is the id of the mergedIn
-//        JavaPairRDD<String,String> mergeRels = spark
-//                .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
-//                .where("relClass=='merges'")
-//                .javaRDD()
-//                .mapToPair(
-//                        (PairFunction<Relation, String,String>)r->
-//                                new Tuple2<String,String>(r.getTarget(), r.getSource())
-//                );
-//
-//        //<dedup_id, json_entity_merged>
-//        final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
-
-        StructType schema = Encoders.bean(Publication.class).schema();
-
-        System.out.println(schema);
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkCreateDedupRecord.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String sourcePath = parser.get("sourcePath");
+        final String entity = parser.get("entity");
+        final String dedupPath = parser.get("dedupPath");
+//        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
+        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
+
+        final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf);
+        dedupRecord.map(r-> {
+            ObjectMapper mapper = new ObjectMapper();
+            return mapper.writeValueAsString(r);
+        }).saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json");
     }

 }
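Note (not part of the commit): a hedged sketch of driving the new job the way the existing tests drive the other stages — local master, the short option names from dedupRecord_parameters.json, and a compressed dedup configuration. The paths are placeholders and dedupConfJson must be filled with a real dedup configuration.

    import eu.dnetlib.dedup.SparkCreateDedupRecord;
    import eu.dnetlib.dhp.application.ArgumentApplicationParser;

    public class DedupRecordLocalRun {
        public static void main(String[] args) throws Exception {
            // placeholder: supply the real dedup configuration JSON here
            String dedupConfJson = "...";

            SparkCreateDedupRecord.main(new String[]{
                    "-mt", "local[*]",          // master
                    "-s", "/tmp/dedup/source",  // placeholder sourcePath holding one folder per entity
                    "-d", "/tmp/dedup",         // placeholder dedupPath holding <entity>_mergeRel
                    "-e", "publication",
                    "-c", ArgumentApplicationParser.compressArgument(dedupConfJson)
            });
        }
    }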
@@ -77,7 +77,7 @@ public class SparkCreateSimRels {
                     return r;
                 });

-        spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save(targetPath+"/"+entity+"_simrel");
+        spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));


@@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity to be deduped",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "dedupConf",
+    "paramDescription": "dedup configuration to be used",
+    "compressed": true,
+    "paramRequired": true
+  },
+  {
+    "paramName": "d",
+    "paramLongName": "dedupPath",
+    "paramDescription": "dedup path to load mergeRelation",
+    "paramRequired": true
+  }
+]
@@ -1,7 +1,33 @@
 [
-{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
-{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequential file to read", "paramRequired": true},
-{"paramName":"e", "paramLongName":"entity", "paramDescription": "the type of entity to be deduped", "paramRequired": true},
-{"paramName":"c", "paramLongName":"dedupConf", "paramDescription": "dedup configuration to be used", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "target path to save dedup result", "paramRequired": true}
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity to be deduped",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "dedupConf",
+    "paramDescription": "dedup configuration to be used",
+    "compressed": true,
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "target path to save dedup result",
+    "paramRequired": true
+  }
 ]
File diff suppressed because one or more lines are too long
@@ -93,6 +93,31 @@
             <arg>--entity</arg><arg>${entity}</arg>
             <arg>--dedupConf</arg><arg>${dedupConf}</arg>
         </spark>
+        <ok to="CreateDedupRecord"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateDedupRecord">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Connected Components</name>
+            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory} --conf
+                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
+                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
+                spark.sql.warehouse.dir="/user/hive/warehouse"
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--dedupPath</arg><arg>${dedupPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
@@ -0,0 +1,61 @@
package eu.dnetlib.dedup;

import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.IOUtils;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class MergeAuthorTest {

    List<Publication> publicationsToMerge;
    final ObjectMapper mapper = new ObjectMapper();

    @Before
    public void setUp() throws Exception {
        // the fixture contains one JSON-serialised Publication per line
        final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json"));

        publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s -> {
            try {
                return mapper.readValue(s, Publication.class);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }).collect(Collectors.toList());
    }

    @Test
    public void test() throws Exception {
        Publication dedup = new Publication();

        // fold every publication into a single record, merging the author lists as we go
        publicationsToMerge.forEach(p -> {
            dedup.mergeFrom(p);
            dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor()));
        });

        System.out.println(mapper.writeValueAsString(dedup));
    }
}
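The test above folds several publications into one record and merges their author lists through DedupUtility.mergeAuthor, whose implementation is not shown in this diff. Purely as an illustration of what such an author merge can look like, here is a self-contained sketch that unions two author lists by normalised full name and prefers the entry carrying an identifier; the AuthorLike type and the merge policy are invented for this sketch and do not describe the project's actual logic.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

public class AuthorUnionSketch {

    // AuthorLike is a stand-in type for this sketch; the real code works on the
    // eu.dnetlib.dhp.schema.oaf author model, whose fields are not shown in this diff.
    static final class AuthorLike {
        final String fullname;
        final String orcid; // may be null
        AuthorLike(String fullname, String orcid) { this.fullname = fullname; this.orcid = orcid; }
    }

    // One plausible reading of "merge author lists": keep every distinct person,
    // preferring the entry that carries an identifier when the same name appears twice.
    static List<AuthorLike> merge(List<AuthorLike> a, List<AuthorLike> b) {
        final Map<String, AuthorLike> byName = new LinkedHashMap<>();
        for (AuthorLike x : concat(a, b)) {
            final String key = x.fullname.toLowerCase(Locale.ROOT).trim();
            final AuthorLike present = byName.get(key);
            if (present == null || (present.orcid == null && x.orcid != null)) {
                byName.put(key, x);
            }
        }
        return new ArrayList<>(byName.values());
    }

    private static List<AuthorLike> concat(List<AuthorLike> a, List<AuthorLike> b) {
        final List<AuthorLike> all = new ArrayList<>();
        if (a != null) all.addAll(a);
        if (b != null) all.addAll(b);
        return all;
    }

    public static void main(String[] args) {
        List<AuthorLike> left = List.of(new AuthorLike("Jane Roe", null));
        List<AuthorLike> right = List.of(new AuthorLike("jane roe", "0000-0002-1825-0097"),
                new AuthorLike("John Doe", null));
        merge(left, right).forEach(x -> System.out.println(x.fullname + " / " + x.orcid));
    }
}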
@@ -1,6 +1,7 @@
 package eu.dnetlib.dedup;

 import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
@@ -27,9 +28,9 @@ public class SparkCreateDedupTest {
     public void createSimRelsTest() throws Exception {
         SparkCreateSimRels.main(new String[] {
             "-mt", "local[*]",
-            "-s", "/Users/miconis/dumps",
+            "-s", "/home/sandro/betadump",
             "-e", "publication",
-            "-c", configuration,
+            "-c", ArgumentApplicationParser.compressArgument(configuration),
             "-t", "/tmp/dedup",
         });
     }
@@ -40,9 +41,9 @@ public class SparkCreateDedupTest {

         SparkCreateConnectedComponent.main(new String[] {
             "-mt", "local[*]",
-            "-s", "/Users/miconis/dumps",
+            "-s", "/home/sandro/betadump",
             "-e", "publication",
-            "-c", configuration,
+            "-c", ArgumentApplicationParser.compressArgument(configuration),
             "-t", "/tmp/dedup",
         });
     }
@@ -52,10 +53,18 @@ public class SparkCreateDedupTest {
     public void dedupRecordTest() throws Exception {
         SparkCreateDedupRecord.main(new String[] {
             "-mt", "local[*]",
-            "-s", "/Users/miconis/dumps",
+            "-s", "/home/sandro/betadump",
             "-e", "publication",
-            "-c", configuration,
+            "-c", ArgumentApplicationParser.compressArgument(configuration),
-            "-t", "/tmp/dedup",
+            "-d", "/tmp/dedup",
         });
     }
+
+    @Test
+    public void printCC() throws Exception {
+        System.out.println(ArgumentApplicationParser.compressArgument(configuration));
+    }
+
 }
File diff suppressed because one or more lines are too long
@@ -1,18 +1,18 @@
 <configuration>
     <property>
         <name>jobTracker</name>
-        <dedupId>yarnRM</dedupId>
+        <value>yarnRM</value>
     </property>
     <property>
         <name>nameNode</name>
-        <dedupId>hdfs://nameservice1</dedupId>
+        <value>hdfs://nameservice1</value>
     </property>
     <property>
         <name>sourceNN</name>
-        <dedupId>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</dedupId>
+        <value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
     </property>
     <property>
         <name>oozie.use.system.libpath</name>
-        <dedupId>true</dedupId>
+        <value>true</value>
     </property>
 </configuration>
@@ -14,12 +14,12 @@
     </property>
     <property>
         <name>hbase_dump_distcp_memory_mb</name>
-        <dedupId>6144</dedupId>
+        <value>6144</value>
         <description>memory for distcp action copying InfoSpace dump from remote cluster</description>
     </property>
     <property>
         <name>hbase_dump_distcp_num_maps</name>
-        <dedupId>1</dedupId>
+        <value>1</value>
         <description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
     </property>
 </parameters>
@@ -1,26 +1,26 @@
 <configuration>
     <property>
         <name>jobTracker</name>
-        <dedupId>yarnRM</dedupId>
+        <value>yarnRM</value>
     </property>
     <property>
         <name>nameNode</name>
-        <dedupId>hdfs://nameservice1</dedupId>
+        <value>hdfs://nameservice1</value>
     </property>
     <property>
         <name>oozie.use.system.libpath</name>
-        <dedupId>true</dedupId>
+        <value>true</value>
     </property>
     <property>
         <name>oozie.action.sharelib.for.spark</name>
-        <dedupId>spark2</dedupId>
+        <value>spark2</value>
     </property>
     <property>
         <name>hive_metastore_uris</name>
-        <dedupId>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</dedupId>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
     </property>
     <property>
         <name>hive_db_name</name>
-        <dedupId>openaire</dedupId>
+        <value>openaire</value>
     </property>
 </configuration>
@@ -54,7 +54,7 @@ Properties overriding order is the following:
 2. `~/.dhp/application.properties` defined properties
 3. `${workflow.source.dir}/job.properties`
 4. `job-override.properties` (located in the project root dir)
-5. `maven -Dparam=dedupId`
+5. `maven -Dparam=value`

 where the maven `-Dparam` property is overriding all the other ones.
@@ -73,7 +73,7 @@ Workflow definition requirements

 This property can be set using maven `-D` switch.

-`[oozie_app]` is the default directory name however it can be set to any dedupId as soon as `oozieAppDir` property is provided with directory name as dedupId.
+`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value.

 Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory.
@@ -73,7 +73,7 @@
             <!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
             <id>attach-test-resources</id>
             <properties>
-                <!--overriding default scope (set to 'runtime') with the 'test' dedupId. Test resources attached to oozie package requires all test dependencies. -->
+                <!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package requires all test dependencies. -->
                 <oozie.package.dependencies.include.scope />
                 <oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
                 <!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
@@ -326,7 +326,7 @@
                             </goals>
                             <configuration>
                                 <tasks>
-                                    <property name="assembly-resources.loc" dedupId="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
+                                    <property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
                                     <unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
                                 </tasks>
                             </configuration>
pom.xml
@@ -236,6 +236,11 @@
             <artifactId>java-jq</artifactId>
             <version>0.10.1</version>
         </dependency>
+        <dependency>
+            <groupId>edu.cmu</groupId>
+            <artifactId>secondstring</artifactId>
+            <version>1.0.0</version>
+        </dependency>
         <dependency>
             <groupId>org.apache.oozie</groupId>