forked from D-Net/dnet-hadoop
commit 1cd6899480: merged from master
@@ -18,4 +18,5 @@
 /*/build
 /build
 spark-warehouse
-/dhp-workflows/dhp-graph-mapper/job-override.properties
+/*/*/job-override.properties
@@ -17,6 +17,7 @@
 <plugin>
 <groupId>org.apache.maven.plugins</groupId>
 <artifactId>maven-compiler-plugin</artifactId>
+<version>${maven.compiler.plugin.version}</version>
 </plugin>
 </plugins>
 </build>
@@ -53,6 +53,7 @@
 <plugin>
 <groupId>org.apache.maven.plugins</groupId>
 <artifactId>maven-compiler-plugin</artifactId>
+<version>${maven.compiler.plugin.version}</version>
 </plugin>
 <plugin>
 <groupId>org.apache.maven.plugins</groupId>
@@ -17,6 +17,10 @@
 <groupId>commons-cli</groupId>
 <artifactId>commons-cli</artifactId>
 </dependency>
+<dependency>
+<groupId>commons-io</groupId>
+<artifactId>commons-io</artifactId>
+</dependency>
 <dependency>
 <groupId>org.apache.commons</groupId>
 <artifactId>commons-lang3</artifactId>
@@ -29,21 +33,15 @@
 <groupId>javax.persistence</groupId>
 <artifactId>javax.persistence-api</artifactId>
 </dependency>

 <dependency>
 <groupId>com.fasterxml.jackson.core</groupId>
 <artifactId>jackson-databind</artifactId>
 </dependency>

 <!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
 <dependency>
 <groupId>com.rabbitmq</groupId>
 <artifactId>amqp-client</artifactId>
 </dependency>
-<dependency>
-<groupId>commons-io</groupId>
-<artifactId>commons-io</artifactId>
-</dependency>
 </dependencies>

 </project>
@@ -2,17 +2,25 @@ package eu.dnetlib.dhp.application;

 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.cli.*;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.IOUtils;

+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.Serializable;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
+import java.io.StringWriter;
+import java.util.*;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import java.util.zip.Inflater;

 public class ArgumentApplicationParser implements Serializable {

 private final Options options = new Options();
 private final Map<String, String> objectMap = new HashMap<>();

+private final List<String> compressedValues = new ArrayList<>();

 public ArgumentApplicationParser(final String json_configuration) throws Exception {
 final ObjectMapper mapper = new ObjectMapper();
 final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
@@ -29,6 +37,9 @@ public class ArgumentApplicationParser implements Serializable {
 final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
 o.setLongOpt(conf.getParamLongName());
 o.setRequired(conf.isParamRequired());
+if (conf.isCompressed()) {
+compressedValues.add(conf.getParamLongName());
+}
 return o;
 }).forEach(options::addOption);

@@ -38,10 +49,32 @@ public class ArgumentApplicationParser implements Serializable {

 }

+public static String decompressValue(final String abstractCompressed) {
+try {
+byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
+GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
+final StringWriter stringWriter = new StringWriter();
+IOUtils.copy(gis, stringWriter);
+return stringWriter.toString();
+} catch (Throwable e) {
+System.out.println("Wrong value to decompress:" + abstractCompressed);
+throw new RuntimeException(e);
+}
+}

+public static String compressArgument(final String value) throws Exception{
+ByteArrayOutputStream out = new ByteArrayOutputStream();
+GZIPOutputStream gzip = new GZIPOutputStream(out);
+gzip.write(value.getBytes());
+gzip.close();
+return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
+}

 public void parseArgument(final String[] args) throws Exception {
 CommandLineParser parser = new BasicParser();
 CommandLine cmd = parser.parse(options, args);
-Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
+Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), compressedValues.contains(it.getLongOpt())? decompressValue(it.getValue()): it.getValue()));
 }

 public String get(final String key) {
@@ -7,6 +7,7 @@ public class OptionsParameter {
 private String paramLongName;
 private String paramDescription;
 private boolean paramRequired;
+private boolean compressed;

 public OptionsParameter() {
 }
@@ -26,4 +27,12 @@
 public boolean isParamRequired() {
 return paramRequired;
 }

+public boolean isCompressed() {
+return compressed;
+}

+public void setCompressed(boolean compressed) {
+this.compressed = compressed;
+}
 }
@@ -3,6 +3,10 @@ package eu.dnetlib.dhp.application;
 import org.apache.commons.io.IOUtils;
 import org.junit.Test;

+import java.io.ByteArrayOutputStream;
+import java.util.Base64;
+import java.util.zip.GZIPOutputStream;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;

@@ -24,6 +28,7 @@ public class ArgumentApplicationParserTest {
 "-ro", "value7",
 "-rr", "value8",
 "-w", "value9",
+"-cc", ArgumentApplicationParser.compressArgument(jsonConfiguration)
 });
 assertNotNull(parser.get("hdfsPath"));
 assertNotNull(parser.get("apidescriptor"));
@@ -45,7 +50,12 @@
 assertEquals("value7", parser.get("rabbitOngoingQueue"));
 assertEquals("value8", parser.get("rabbitReportQueue"));
 assertEquals("value9", parser.get("workflowId"));
+assertEquals(jsonConfiguration, parser.get("ccCoco"));
 }

 }
@@ -1,12 +1,13 @@
 [
 {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
 {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
 {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
 {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true},
 {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true},
 {"paramName":"rp", "paramLongName":"rabbitPassWord", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true},
 {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true},
 {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true},
 {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true},
-{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true}
+{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true},
+{"paramName":"cc", "paramLongName":"ccCoco", "paramDescription": "the identifier of the dnet Workflow", "compressed":true,"paramRequired": true}
 ]
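Editor's note, not part of the commit: a minimal sketch of the new compressed-argument round trip exercised by the "-cc"/"ccCoco" test case above; the payload string is a hypothetical example.

    import eu.dnetlib.dhp.application.ArgumentApplicationParser;

    public class CompressedArgumentRoundTrip {
        public static void main(String[] args) throws Exception {
            String original = "{\"paramName\":\"cc\"}";                           // hypothetical payload
            String packed = ArgumentApplicationParser.compressArgument(original);  // gzip + Base64 encode
            String restored = ArgumentApplicationParser.decompressValue(packed);   // Base64 decode + gunzip
            System.out.println(original.equals(restored));                         // expected: true
        }
    }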
@@ -23,4 +23,23 @@ public class Context implements Serializable {
 public void setDataInfo(List<DataInfo> dataInfo) {
 this.dataInfo = dataInfo;
 }

+@Override
+public int hashCode() {
+return id ==null? 0 : id.hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;

+Context other = (Context) obj;

+return id.equals(other.getId());
+}
 }
@@ -19,7 +19,7 @@ public class Dataset extends Result implements Serializable {

 private List<GeoLocation> geolocation;

 public Field<String> getStoragedate() {
 return storagedate;
 }

@@ -74,4 +74,26 @@
 public void setGeolocation(List<GeoLocation> geolocation) {
 this.geolocation = geolocation;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);
+final Dataset d = (Dataset) e;

+storagedate = d.getStoragedate() != null && compareTrust(this, e)<0? d.getStoragedate() : storagedate;
+device= d.getDevice() != null && compareTrust(this, e)<0? d.getDevice() : device;
+size= d.getSize() != null && compareTrust(this, e)<0? d.getSize() : size;
+version= d.getVersion() != null && compareTrust(this, e)<0? d.getVersion() : version;
+lastmetadataupdate= d.getLastmetadataupdate() != null && compareTrust(this, e)<0? d.getLastmetadataupdate() :lastmetadataupdate;
+metadataversionnumber= d.getMetadataversionnumber() != null && compareTrust(this, e)<0? d.getMetadataversionnumber() : metadataversionnumber;
+geolocation = mergeLists(geolocation, d.getGeolocation());
+mergeOAFDataInfo(d);
+}
 }
@@ -78,7 +78,7 @@ public class Datasource extends OafEntity implements Serializable {

 private Field<String> certificates;

-private List< KeyValue> policies;
+private List<KeyValue> policies;

 private Journal journal;

@@ -361,4 +361,67 @@
 public void setJournal(Journal journal) {
 this.journal = journal;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);
+Datasource d = (Datasource)e;
+datasourcetype = d.getDatasourcetype() != null && compareTrust(this, e)<0? d.getDatasourcetype() : datasourcetype;
+openairecompatibility = d.getOpenairecompatibility() != null && compareTrust(this, e)<0? d.getOpenairecompatibility() : openairecompatibility;
+officialname = d.getOfficialname() != null && compareTrust(this, e)<0? d.getOfficialname() : officialname;
+englishname = d.getEnglishname() != null && compareTrust(this, e)<0? d.getEnglishname() : officialname;
+websiteurl = d.getWebsiteurl() != null && compareTrust(this, e)<0? d.getWebsiteurl() : websiteurl;
+logourl = d.getLogourl() != null && compareTrust(this, e)<0? d.getLogourl() : getLogourl();
+contactemail = d.getContactemail() != null && compareTrust(this, e)<0? d.getContactemail() : contactemail;
+namespaceprefix = d.getNamespaceprefix() != null && compareTrust(this, e)<0? d.getNamespaceprefix() : namespaceprefix;
+latitude = d.getLatitude() != null && compareTrust(this, e)<0? d.getLatitude() : latitude;
+longitude = d.getLongitude() != null && compareTrust(this, e)<0? d.getLongitude() : longitude;
+dateofvalidation = d.getDateofvalidation() != null && compareTrust(this, e)<0? d.getDateofvalidation() : dateofvalidation;
+description = d.getDescription() != null && compareTrust(this, e)<0? d.getDescription() : description;
+subjects = mergeLists(subjects, d.getSubjects());

+// opendoar specific fields (od*)
+odnumberofitems = d.getOdnumberofitems() != null && compareTrust(this, e)<0? d.getOdnumberofitems() : odnumberofitems;
+odnumberofitemsdate = d.getOdnumberofitemsdate() != null && compareTrust(this, e)<0? d.getOdnumberofitemsdate() : odnumberofitemsdate;
+odpolicies = d.getOdpolicies() != null && compareTrust(this, e)<0? d.getOdpolicies() : odpolicies;
+odlanguages = mergeLists(odlanguages, d.getOdlanguages());
+odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes());
+accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage());

+// re3data fields
+releasestartdate = d.getReleasestartdate() != null && compareTrust(this, e)<0? d.getReleasestartdate() : releasestartdate;
+releaseenddate = d.getReleaseenddate() != null && compareTrust(this, e)<0? d.getReleaseenddate() : releaseenddate;
+missionstatementurl = d.getMissionstatementurl() != null && compareTrust(this, e)<0? d.getMissionstatementurl() : missionstatementurl;
+dataprovider = d.getDataprovider() != null && compareTrust(this, e)<0? d.getDataprovider() : dataprovider;
+serviceprovider = d.getServiceprovider() != null && compareTrust(this, e)<0? d.getServiceprovider() : serviceprovider;

+// {open, restricted or closed}
+databaseaccesstype = d.getDatabaseaccesstype() != null && compareTrust(this, e)<0? d.getDatabaseaccesstype() : databaseaccesstype;

+// {open, restricted or closed}
+datauploadtype = d.getDatauploadtype() != null && compareTrust(this, e)<0? d.getDatauploadtype() : datauploadtype;

+// {feeRequired, registration, other}
+databaseaccessrestriction = d.getDatabaseaccessrestriction() != null && compareTrust(this, e)<0? d.getDatabaseaccessrestriction() : databaseaccessrestriction;

+// {feeRequired, registration, other}
+datauploadrestriction = d.getDatauploadrestriction() != null && compareTrust(this, e)<0? d.getDatauploadrestriction() : datauploadrestriction;

+versioning = d.getVersioning() != null && compareTrust(this, e)<0? d.getVersioning() : versioning;
+citationguidelineurl = d.getCitationguidelineurl() != null && compareTrust(this, e)<0? d.getCitationguidelineurl() : citationguidelineurl;

+//{yes, no, unknown}
+qualitymanagementkind = d.getQualitymanagementkind() != null && compareTrust(this, e)<0? d.getQualitymanagementkind() : qualitymanagementkind;
+pidsystems = d.getPidsystems() != null && compareTrust(this, e)<0? d.getPidsystems() : pidsystems;
+certificates = d.getCertificates() != null && compareTrust(this, e)<0? d.getCertificates() : certificates;
+policies = mergeLists(policies, d.getPolicies());
+journal = d.getJournal() != null && compareTrust(this, e)<0? d.getJournal() : journal;

+mergeOAFDataInfo(e);
+}
 }
@@ -23,4 +23,21 @@ public class Field<T> implements Serializable {
 public void setDataInfo(DataInfo dataInfo) {
 this.dataInfo = dataInfo;
 }

+@Override
+public int hashCode() {
+return getValue() == null ? 0 : getValue().hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;
+Field<T> other = (Field<T>) obj;
+return getValue().equals(other.getValue());
+}
 }
@@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;

 import java.io.Serializable;

 public class GeoLocation implements Serializable {
@@ -33,4 +35,35 @@ public class GeoLocation implements Serializable {
 public void setPlace(String place) {
 this.place = place;
 }

+public boolean isBlank() {
+return StringUtils.isBlank(point) &&
+StringUtils.isBlank(box) &&
+StringUtils.isBlank(place);
+}

+public String toComparableString() {
+return isBlank()?"":String.format("%s::%s%s", point != null ? point.toLowerCase() : "", box != null ? box.toLowerCase() : "", place != null ? place.toLowerCase() : "");
+}

+@Override
+public int hashCode() {
+return toComparableString().hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;

+GeoLocation other = (GeoLocation) obj;

+return toComparableString()
+.equals(other.toComparableString());
+}
 }
@@ -85,4 +85,34 @@ public class Instance implements Serializable {
 public void setDateofacceptance(Field<String> dateofacceptance) {
 this.dateofacceptance = dateofacceptance;
 }

+public String toComparableString(){
+return String.format("%s::%s::%s::%s",
+hostedby != null && hostedby.getKey()!= null ? hostedby.getKey().toLowerCase() : "",
+accessright!= null && accessright.getClassid()!= null ? accessright.getClassid() : "",
+instancetype!= null && instancetype.getClassid()!= null ? instancetype.getClassid() : "",
+url != null ? url:"");
+}

+@Override
+public int hashCode() {
+return toComparableString().hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;

+Instance other = (Instance) obj;

+return toComparableString()
+.equals(other.toComparableString());
+}
 }
@@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;

 import java.io.Serializable;

 public class KeyValue implements Serializable {
@@ -33,4 +35,31 @@ public class KeyValue implements Serializable {
 public void setDataInfo(DataInfo dataInfo) {
 this.dataInfo = dataInfo;
 }

+public String toComparableString() {
+return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
+}

+public boolean isBlank() {
+return StringUtils.isBlank(key) && StringUtils.isBlank(value);
+}

+@Override
+public int hashCode() {
+return toComparableString().hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;

+KeyValue other = (KeyValue) obj;

+return toComparableString().equals(other.toComparableString());
+}
 }
@@ -1,8 +1,5 @@
 package eu.dnetlib.dhp.schema.oaf;

-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;

 import java.io.Serializable;

 public abstract class Oaf implements Serializable {
@@ -27,13 +24,23 @@ public abstract class Oaf implements Serializable {
 this.lastupdatetimestamp = lastupdatetimestamp;
 }

-@Override
-public String toString() {
-try {
-return new ObjectMapper().writeValueAsString(this);
-} catch (JsonProcessingException e) {
-throw new RuntimeException(e);
-}
+public void mergeOAFDataInfo(Oaf e) {
+if (e.getDataInfo()!= null && compareTrust(this,e)<0)
+dataInfo = e.getDataInfo();
 }

+protected String extractTrust(Oaf e) {
+if (e == null || e.getDataInfo()== null || e.getDataInfo().getTrust()== null)
+return "0.0";
+return e.getDataInfo().getTrust();
+}

+protected int compareTrust(Oaf a, Oaf b) {
+return extractTrust(a).compareTo(extractTrust(b));
+}
 }
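Editor's note, not part of the commit: extractTrust returns the trust value as a string (or "0.0" when missing) and compareTrust compares those strings lexicographically, so the "higher trust wins" rule used by the mergeFrom methods behaves as sketched below; the values are illustrative.

    // Illustrative only: mirrors extractTrust(a).compareTo(extractTrust(b)) above.
    String trustA = "0.8";                           // hypothetical trust of entity a
    String trustB = "0.9";                           // hypothetical trust of entity b
    boolean bWins = trustA.compareTo(trustB) < 0;    // true: b's dataInfo would overwrite a's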
@@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.schema.oaf;

 import java.io.Serializable;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;

 public abstract class OafEntity extends Oaf implements Serializable {

@@ -84,4 +85,36 @@ public abstract class OafEntity extends Oaf implements Serializable {
 public void setOaiprovenance(OAIProvenance oaiprovenance) {
 this.oaiprovenance = oaiprovenance;
 }

+public void mergeFrom(OafEntity e) {

+if (e == null)
+return;

+originalId = mergeLists(originalId, e.getOriginalId());
+collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
+pid = mergeLists(pid, e.getPid());

+if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
+dateofcollection = e.getDateofcollection();

+if (e.getDateoftransformation() != null && compareTrust(this, e) < 0)
+dateoftransformation = e.getDateoftransformation();

+extraInfo = mergeLists(extraInfo, e.getExtraInfo());

+if (e.getOaiprovenance() != null && compareTrust(this, e) < 0)
+oaiprovenance = e.getOaiprovenance();
+}

+protected <T> List<T> mergeLists(final List<T>... lists) {
+return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream).distinct().collect(Collectors.toList());
+}
 }
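Editor's note, not part of the commit: mergeLists deduplicates with distinct(), which relies on equals()/hashCode() of the element type; that is why KeyValue, StructuredProperty, Qualifier, GeoLocation and Instance gain toComparableString-based overrides in this commit. A standalone sketch of that interaction using plain strings (mirrors the mergeListsTest in MergeTest further below):

    import java.util.*;
    import java.util.stream.Collectors;

    public class MergeListsSketch {
        @SafeVarargs
        static <T> List<T> mergeLists(final List<T>... lists) {
            // same logic as OafEntity.mergeLists: concatenate, drop null lists, keep one copy per equal element
            return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream)
                    .distinct().collect(Collectors.toList());
        }

        public static void main(String[] args) {
            List<String> a = Arrays.asList("a", "b", "c", "e");
            List<String> b = Arrays.asList("a", "b", "c", "d");
            System.out.println(mergeLists(a, b, null)); // prints [a, b, c, e, d]
        }
    }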
@@ -164,4 +164,28 @@ public class Organization extends OafEntity implements Serializable {
 public void setCountry(Qualifier country) {
 this.country = country;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);
+final Organization o = (Organization) e;
+legalshortname = o.getLegalshortname() != null && compareTrust(this, e)<0? o.getLegalshortname() : legalshortname;
+legalname = o.getLegalname() != null && compareTrust(this, e)<0 ? o.getLegalname() : legalname;
+alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames);
+websiteurl = o.getWebsiteurl() != null && compareTrust(this, e)<0? o.getWebsiteurl() : websiteurl;
+logourl = o.getLogourl() != null && compareTrust(this, e)<0? o.getLogourl() : logourl;
+eclegalbody = o.getEclegalbody() != null && compareTrust(this, e)<0? o.getEclegalbody() : eclegalbody;
+eclegalperson = o.getEclegalperson() != null && compareTrust(this, e)<0? o.getEclegalperson() : eclegalperson;
+ecnonprofit = o.getEcnonprofit() != null && compareTrust(this, e)<0? o.getEcnonprofit() : ecnonprofit;
+ecresearchorganization = o.getEcresearchorganization() != null && compareTrust(this, e)<0? o.getEcresearchorganization() : ecresearchorganization;
+echighereducation = o.getEchighereducation() != null && compareTrust(this, e)<0? o.getEchighereducation() : echighereducation;
+ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null && compareTrust(this, e)<0? o.getEcinternationalorganizationeurinterests() : ecinternationalorganizationeurinterests;
+ecinternationalorganization = o.getEcinternationalorganization() != null && compareTrust(this, e)<0? o.getEcinternationalorganization() : ecinternationalorganization;
+ecenterprise = o.getEcenterprise() != null && compareTrust(this, e)<0? o.getEcenterprise() :ecenterprise;
+ecsmevalidated = o.getEcsmevalidated() != null && compareTrust(this, e)<0? o.getEcsmevalidated() :ecsmevalidated;
+ecnutscode = o.getEcnutscode() != null && compareTrust(this, e)<0? o.getEcnutscode() :ecnutscode;
+country = o.getCountry() != null && compareTrust(this, e)<0 ? o.getCountry() :country;
+mergeOAFDataInfo(o);
+}
 }
@@ -34,4 +34,16 @@ public class OtherResearchProduct extends Result implements Serializable {
 public void setTool(List<Field<String>> tool) {
 this.tool = tool;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);

+OtherResearchProduct o = (OtherResearchProduct)e;

+contactperson = mergeLists(contactperson, o.getContactperson());
+contactgroup = mergeLists(contactgroup, o.getContactgroup());
+tool = mergeLists(tool, o.getTool());
+mergeOAFDataInfo(e);
+}
 }
@@ -264,4 +264,39 @@ public class Project extends OafEntity implements Serializable {
 public void setFundedamount(Float fundedamount) {
 this.fundedamount = fundedamount;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);
+Project p = (Project)e;

+websiteurl= p.getWebsiteurl()!= null && compareTrust(this,e)<0?p.getWebsiteurl():websiteurl;
+code= p.getCode()!=null && compareTrust(this,e)<0?p.getCode():code;
+acronym= p.getAcronym()!= null && compareTrust(this,e)<0?p.getAcronym():acronym;
+title= p.getTitle()!= null && compareTrust(this,e)<0?p.getTitle():title;
+startdate= p.getStartdate()!=null && compareTrust(this,e)<0?p.getStartdate():startdate;
+enddate= p.getEnddate()!=null && compareTrust(this,e)<0?p.getEnddate():enddate;
+callidentifier= p.getCallidentifier()!=null && compareTrust(this,e)<0?p.getCallidentifier():callidentifier;
+keywords= p.getKeywords()!=null && compareTrust(this,e)<0?p.getKeywords():keywords;
+duration= p.getDuration()!=null && compareTrust(this,e)<0?p.getDuration():duration;
+ecsc39= p.getEcsc39()!=null && compareTrust(this,e)<0?p.getEcsc39():ecsc39;
+oamandatepublications= p.getOamandatepublications()!=null && compareTrust(this,e)<0?p.getOamandatepublications():oamandatepublications;
+ecarticle29_3= p.getEcarticle29_3()!=null && compareTrust(this,e)<0?p.getEcarticle29_3():ecarticle29_3;
+subjects= mergeLists(subjects, p.getSubjects());
+fundingtree= mergeLists(fundingtree, p.getFundingtree());
+contracttype= p.getContracttype()!=null && compareTrust(this,e)<0?p.getContracttype():contracttype;
+optional1= p.getOptional1()!=null && compareTrust(this,e)<0?p.getOptional1():optional1;
+optional2= p.getOptional2()!=null && compareTrust(this,e)<0?p.getOptional2():optional2;
+jsonextrainfo= p.getJsonextrainfo()!=null && compareTrust(this,e)<0?p.getJsonextrainfo():jsonextrainfo;
+contactfullname= p.getContactfullname()!=null && compareTrust(this,e)<0?p.getContactfullname():contactfullname;
+contactfax= p.getContactfax()!=null && compareTrust(this,e)<0?p.getContactfax():contactfax;
+contactphone= p.getContactphone()!=null && compareTrust(this,e)<0?p.getContactphone():contactphone;
+contactemail= p.getContactemail()!=null && compareTrust(this,e)<0?p.getContactemail():contactemail;
+summary= p.getSummary()!=null && compareTrust(this,e)<0?p.getSummary():summary;
+currency= p.getCurrency()!=null && compareTrust(this,e)<0?p.getCurrency():currency;
+totalcost= p.getTotalcost()!=null && compareTrust(this,e)<0?p.getTotalcost():totalcost;
+fundedamount= p.getFundedamount()!= null && compareTrust(this,e)<0?p.getFundedamount():fundedamount;
+mergeOAFDataInfo(e);
+}
 }
@@ -14,4 +14,17 @@ public class Publication extends Result implements Serializable {
 public void setJournal(Journal journal) {
 this.journal = journal;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);

+Publication p = (Publication) e;

+if (p.getJournal() != null && compareTrust(this, e)<0)
+journal = p.getJournal();
+mergeOAFDataInfo(e);
+}

 }
@@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;

 import java.io.Serializable;

 public class Qualifier implements Serializable {
@@ -40,4 +42,37 @@ public class Qualifier implements Serializable {
 public void setSchemename(String schemename) {
 this.schemename = schemename;
 }

+public String toComparableString() {
+return isBlank()?"": String.format("%s::%s::%s::%s",
+classid != null ? classid : "",
+classname != null ? classname : "",
+schemeid != null ? schemeid : "",
+schemename != null ? schemename : "");
+}
+public boolean isBlank() {
+return StringUtils.isBlank(classid) &&
+StringUtils.isBlank(classname) &&
+StringUtils.isBlank(schemeid) &&
+StringUtils.isBlank(schemename);
+}
+@Override
+public int hashCode() {
+return toComparableString().hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;

+Qualifier other = (Qualifier) obj;

+return toComparableString()
+.equals(other.toComparableString());
+}
 }
@@ -1,7 +1,10 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;

 import java.io.Serializable;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;

 public abstract class Result extends OafEntity implements Serializable {

@@ -12,35 +15,35 @@ public abstract class Result extends OafEntity implements Serializable {

 // common fields
 private Qualifier language;

 private List<Qualifier> country;

 private List<StructuredProperty> subject;

 private List<StructuredProperty> title;

 private List<StructuredProperty> relevantdate;

 private List<Field<String>> description;

 private Field<String> dateofacceptance;

 private Field<String> publisher;

 private Field<String> embargoenddate;

 private List<Field<String>> source;

 private List<Field<String>> fulltext; // remove candidate

 private List<Field<String>> format;

 private List<Field<String>> contributor;

 private Qualifier resourcetype;

 private List<Field<String>> coverage;

 private Field<String> refereed; //peer-review status

 private List<Context> context;
@@ -240,4 +243,76 @@ public abstract class Result extends OafEntity implements Serializable {
 this.processingchargecurrency = processingchargecurrency;
 return this;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);

+Result r = (Result) e;

+instance = mergeLists(instance, r.getInstance());

+if (r.getResulttype() != null && compareTrust(this, r) < 0)
+resulttype = r.getResulttype();

+if (r.getLanguage() != null && compareTrust(this, r) < 0)
+language = r.getLanguage();

+country = mergeLists(country, r.getCountry());
+subject = mergeLists(subject, r.getSubject());
+title = mergeLists(title, r.getTitle());
+relevantdate = mergeLists(relevantdate, r.getRelevantdate());
+description = longestLists(description, r.getDescription());

+if (r.getPublisher() != null && compareTrust(this, r) < 0)
+publisher = r.getPublisher();

+if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0)
+embargoenddate = r.getEmbargoenddate();

+source = mergeLists(source, r.getSource());
+fulltext = mergeLists(fulltext, r.getFulltext());
+format = mergeLists(format, r.getFormat());
+contributor = mergeLists(contributor, r.getContributor());

+if (r.getResourcetype() != null)
+resourcetype = r.getResourcetype();

+coverage = mergeLists(coverage, r.getCoverage());

+if (r.getRefereed() != null && compareTrust(this, r) < 0)
+refereed = r.getRefereed();

+context = mergeLists(context, r.getContext());

+if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
+processingchargeamount = r.getProcessingchargeamount();

+if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
+processingchargecurrency = r.getProcessingchargecurrency();

+externalReference = mergeLists(externalReference, r.getExternalReference());

+}

+private List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
+if (a == null || b == null)
+return a == null ? b : a;
+if (a.size() == b.size()) {
+int msa = a.stream().filter(i -> i.getValue() != null).map(i -> i.getValue().length()).max(Comparator.naturalOrder()).orElse(0);
+int msb = b.stream().filter(i -> i.getValue() != null).map(i -> i.getValue().length()).max(Comparator.naturalOrder()).orElse(0);
+return msa > msb ? a : b;
+}
+return a.size() > b.size() ? a : b;
+}

 }
@@ -44,4 +44,19 @@ public class Software extends Result implements Serializable {
 public void setProgrammingLanguage(Qualifier programmingLanguage) {
 this.programmingLanguage = programmingLanguage;
 }

+@Override
+public void mergeFrom(OafEntity e) {
+super.mergeFrom(e);
+final Software s = (Software) e;
+documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl());
+license = mergeLists(license, s.getLicense());
+codeRepositoryUrl = s.getCodeRepositoryUrl()!= null && compareTrust(this, s)<0?s.getCodeRepositoryUrl():codeRepositoryUrl;
+programmingLanguage= s.getProgrammingLanguage()!= null && compareTrust(this, s)<0?s.getProgrammingLanguage():programmingLanguage;
+mergeOAFDataInfo(e);
+}
 }
@@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable {
 public void setDataInfo(DataInfo dataInfo) {
 this.dataInfo = dataInfo;
 }

+public String toComparableString(){
+return value != null ? value.toLowerCase() : "";
+}

+@Override
+public int hashCode() {
+return toComparableString().hashCode();
+}

+@Override
+public boolean equals(Object obj) {
+if (this == obj)
+return true;
+if (obj == null)
+return false;
+if (getClass() != obj.getClass())
+return false;

+StructuredProperty other = (StructuredProperty) obj;

+return toComparableString()
+.equals(other.toComparableString());
+}
 }
@@ -0,0 +1,89 @@
+package eu.dnetlib.dhp.schema.oaf;

+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;

+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;

+public class MergeTest {

+OafEntity oaf;

+@Before
+public void setUp() {
+oaf = new Publication();
+}

+@Test
+public void mergeListsTest() {

+//string list merge test
+List<String> a = Arrays.asList("a", "b", "c", "e");
+List<String> b = Arrays.asList("a", "b", "c", "d");
+List<String> c = null;

+System.out.println("merge result 1 = " + oaf.mergeLists(a, b));
+System.out.println("merge result 2 = " + oaf.mergeLists(a, c));
+System.out.println("merge result 3 = " + oaf.mergeLists(c, c));
+}

+@Test
+public void mergePublicationCollectedFromTest() {

+Publication a = new Publication();
+Publication b = new Publication();

+a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed")));
+b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open")));

+a.mergeFrom(b);

+Assert.assertNotNull(a.getCollectedfrom());
+Assert.assertEquals(3, a.getCollectedfrom().size());

+}

+@Test
+public void mergePublicationSubjectTest() {

+Publication a = new Publication();
+Publication b = new Publication();

+a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe")));
+b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe")));

+a.mergeFrom(b);

+Assert.assertNotNull(a.getSubject());
+Assert.assertEquals(3, a.getSubject().size());

+}

+private KeyValue setKV(final String key, final String value) {

+KeyValue k = new KeyValue();

+k.setKey(key);
+k.setValue(value);

+return k;
+}

+private StructuredProperty setSP(final String value, final String schema, final String classname) {
+StructuredProperty s = new StructuredProperty();
+s.setValue(value);
+Qualifier q = new Qualifier();
+q.setClassname(classname);
+q.setClassid(classname);
+q.setSchemename(schema);
+q.setSchemeid(schema);
+s.setQualifier(q);
+return s;
+}
+}
@@ -7,6 +7,8 @@
 <version>1.0.5-SNAPSHOT</version>
 </parent>
 <artifactId>dhp-aggregation</artifactId>


 <dependencies>

 <dependency>
@@ -89,6 +89,8 @@ public class TransformationJobTest {
 "-rh", "",
 "-ro", "",
 "-rr", ""});


 }

 @Test
@@ -96,7 +98,7 @@ public class TransformationJobTest {
 final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
 System.out.println("path = " + path);

-Path tempDirWithPrefix = Files.createTempDirectory("mdsotre_output");
+Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output");

 System.out.println(tempDirWithPrefix.toFile().getAbsolutePath());
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<parent>
+<artifactId>dhp-workflows</artifactId>
+<groupId>eu.dnetlib.dhp</groupId>
+<version>1.0.5-SNAPSHOT</version>
+</parent>
+<modelVersion>4.0.0</modelVersion>

+<artifactId>dhp-dedup</artifactId>

+<dependencies>

+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-core_2.11</artifactId>
+</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-sql_2.11</artifactId>
+</dependency>

+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-common</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-schemas</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>com.arakelian</groupId>
+<artifactId>java-jq</artifactId>
+</dependency>

+<dependency>
+<groupId>eu.dnetlib</groupId>
+<artifactId>dnet-pace-core</artifactId>
+</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-graphx_2.11</artifactId>
+</dependency>
+<dependency>
+<groupId>com.fasterxml.jackson.core</groupId>
+<artifactId>jackson-databind</artifactId>
+</dependency>
+<dependency>
+<groupId>com.fasterxml.jackson.core</groupId>
+<artifactId>jackson-core</artifactId>
+</dependency>

+</dependencies>

+</project>
@@ -0,0 +1,119 @@
+package eu.dnetlib.dedup;

+import eu.dnetlib.dhp.schema.oaf.Field;
+import org.apache.commons.lang.StringUtils;

+import java.time.Year;
+import java.util.*;
+import java.util.stream.Collectors;

+import static java.util.Collections.reverseOrder;
+import static java.util.Map.Entry.comparingByValue;
+import static java.util.stream.Collectors.toMap;
+import static org.apache.commons.lang.StringUtils.endsWith;
+import static org.apache.commons.lang.StringUtils.substringBefore;

+public class DatePicker {

+private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
+private static final String DATE_DEFAULT_SUFFIX = "01-01";
+private static final int YEAR_LB = 1300;
+private static final int YEAR_UB = Year.now().getValue() + 5;

+public static Field<String> pick(final Collection<String> dateofacceptance) {

+final Map<String, Integer> frequencies = dateofacceptance
+.parallelStream()
+.filter(StringUtils::isNotBlank)
+.collect(
+Collectors.toConcurrentMap(
+w -> w, w -> 1, Integer::sum));

+if (frequencies.isEmpty()) {
+return new Field<>();
+}

+final Field<String> date = new Field<>();
+date.setValue(frequencies.keySet().iterator().next());

+// let's sort this map by values first, filtering out invalid dates
+final Map<String, Integer> sorted = frequencies
+.entrySet()
+.stream()
+.filter(d -> StringUtils.isNotBlank(d.getKey()))
+.filter(d -> d.getKey().matches(DATE_PATTERN))
+.filter(d -> inRange(d.getKey()))
+.sorted(reverseOrder(comparingByValue()))
+.collect(
+toMap(
+Map.Entry::getKey,
+Map.Entry::getValue, (e1, e2) -> e2,
+LinkedHashMap::new));

+// shortcut
+if (sorted.size() == 0) {
+return date;
+}

+// voting method (1/3 + 1) wins
+if (sorted.size() >= 3) {
+final int acceptThreshold = (sorted.size() / 3) + 1;
+final List<String> accepted = sorted.entrySet().stream()
+.filter(e -> e.getValue() >= acceptThreshold)
+.map(e -> e.getKey())
+.collect(Collectors.toList());

+// cannot find strong majority
+if (accepted.isEmpty()) {
+final int max = sorted.values().iterator().next();
+Optional<String> first = sorted.entrySet().stream()
+.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
+.map(Map.Entry::getKey)
+.findFirst();
+if (first.isPresent()) {
+date.setValue(first.get());
+return date;
+}

+date.setValue(sorted.keySet().iterator().next());
+return date;
+}

+if (accepted.size() == 1) {
+date.setValue(accepted.get(0));
+return date;
+} else {
+final Optional<String> first = accepted.stream()
+.filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
+.findFirst();
+if (first.isPresent()) {
+date.setValue(first.get());
+return date;
+}

+return date;
+}

+//1st non YYYY-01-01 is returned
+} else {
+if (sorted.size() == 2) {
+for (Map.Entry<String, Integer> e : sorted.entrySet()) {
+if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
+date.setValue(e.getKey());
+return date;
+}
+}
+}

+// none of the dates seems good enough, return the 1st one
+date.setValue(sorted.keySet().iterator().next());
+return date;
+}
+}

+private static boolean inRange(final String date) {
+final int year = Integer.parseInt(substringBefore(date, "-"));
+return year >= YEAR_LB && year <= YEAR_UB;
+}

+}
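Editor's note, not part of the commit: an illustrative use of DatePicker.pick exercising the voting path above; with three distinct candidate dates the accept threshold is (3 / 3) + 1 = 2, so only the date seen three times reaches it. The sample values are hypothetical.

    import eu.dnetlib.dedup.DatePicker;
    import eu.dnetlib.dhp.schema.oaf.Field;
    import java.util.Arrays;

    public class DatePickerSketch {
        public static void main(String[] args) {
            Field<String> picked = DatePicker.pick(Arrays.asList(
                    "2018-06-15", "2018-06-15", "2018-06-15", "2017-01-01", "2016-05-02"));
            System.out.println(picked.getValue()); // expected: 2018-06-15
        }
    }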
@ -0,0 +1,279 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.apache.commons.lang.NotImplementedException;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toMap;
|
||||||
|
|
||||||
|
public class DedupRecordFactory {
|
||||||
|
|
||||||
|
public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
|
||||||
|
long ts = System.currentTimeMillis();
|
||||||
|
//<id, json_entity>
|
||||||
|
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
|
||||||
|
.mapToPair((PairFunction<String, String, String>) it ->
|
||||||
|
new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)
|
||||||
|
);
|
||||||
|
|
||||||
|
//<source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||||
|
JavaPairRDD<String, String> mergeRels = spark
|
||||||
|
.read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
|
||||||
|
.where("relClass=='merges'")
|
||||||
|
.javaRDD()
|
||||||
|
.mapToPair(
|
||||||
|
(PairFunction<Relation, String, String>) r ->
|
||||||
|
new Tuple2<String, String>(r.getTarget(), r.getSource())
|
||||||
|
);
|
||||||
|
|
||||||
|
//<dedup_id, json_entity_merged>
|
||||||
|
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||||
|
|
||||||
|
JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
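// group the payloads of the duplicated entities under their dedup root id, so each merger receives all duplicates of a record at once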
|
||||||
|
|
||||||
|
switch (entityType) {
|
||||||
|
case publication:
|
||||||
|
return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
|
||||||
|
case dataset:
|
||||||
|
return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
|
||||||
|
case project:
|
||||||
|
return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
|
||||||
|
case software:
|
||||||
|
return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
|
||||||
|
case datasource:
|
||||||
|
return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
|
||||||
|
case organization:
|
||||||
|
return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
|
||||||
|
case otherresearchproduct:
|
||||||
|
return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
|
||||||
|
default:
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Publication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Publication p = new Publication(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
p.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(pub -> {
|
||||||
|
try {
|
||||||
|
Publication publication = mapper.readValue(pub, Publication.class);
|
||||||
|
|
||||||
|
p.mergeFrom(publication);
|
||||||
|
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
|
||||||
|
//add to the list if they are not null
|
||||||
|
if (publication.getDateofacceptance() != null)
|
||||||
|
dateofacceptance.add(publication.getDateofacceptance().getValue());
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
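// the acceptance date of the dedup record is chosen by DatePicker as the most representative (most frequent valid) date among the duplicates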
|
||||||
|
if (p.getDataInfo() == null)
|
||||||
|
p.setDataInfo(new DataInfo());
|
||||||
|
p.getDataInfo().setTrust("0.9");
|
||||||
|
p.setLastupdatetimestamp(ts);
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Dataset d = new Dataset(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
d.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(dat -> {
|
||||||
|
try {
|
||||||
|
Dataset dataset = mapper.readValue(dat, Dataset.class);
|
||||||
|
|
||||||
|
d.mergeFrom(dataset);
|
||||||
|
d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
|
||||||
|
//add to the list if they are not null
|
||||||
|
if (dataset.getDateofacceptance() != null)
|
||||||
|
dateofacceptance.add(dataset.getDateofacceptance().getValue());
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
d.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||||
|
if (d.getDataInfo() == null)
|
||||||
|
d.setDataInfo(new DataInfo());
|
||||||
|
d.getDataInfo().setTrust("0.9");
|
||||||
|
d.setLastupdatetimestamp(ts);
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Project p = new Project(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
p.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(proj -> {
|
||||||
|
try {
|
||||||
|
Project project = mapper.readValue(proj, Project.class);
|
||||||
|
|
||||||
|
p.mergeFrom(project);
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (p.getDataInfo() == null)
|
||||||
|
p.setDataInfo(new DataInfo());
|
||||||
|
p.getDataInfo().setTrust("0.9");
|
||||||
|
p.setLastupdatetimestamp(ts);
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Software s = new Software(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
s.setId(e._1());
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(soft -> {
|
||||||
|
try {
|
||||||
|
Software software = mapper.readValue(soft, Software.class);
|
||||||
|
|
||||||
|
s.mergeFrom(software);
|
||||||
|
s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
|
||||||
|
//add to the list if they are not null
|
||||||
|
if (software.getDateofacceptance() != null)
|
||||||
|
dateofacceptance.add(software.getDateofacceptance().getValue());
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
s.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||||
|
if (s.getDataInfo() == null)
|
||||||
|
s.setDataInfo(new DataInfo());
|
||||||
|
s.getDataInfo().setTrust("0.9");
|
||||||
|
s.setLastupdatetimestamp(ts);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
Datasource d = new Datasource(); //the result of the merge, to be returned at the end
|
||||||
|
d.setId(e._1());
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(dat -> {
|
||||||
|
try {
|
||||||
|
Datasource datasource = mapper.readValue(dat, Datasource.class);
|
||||||
|
|
||||||
|
d.mergeFrom(datasource);
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (d.getDataInfo() == null)
|
||||||
|
d.setDataInfo(new DataInfo());
|
||||||
|
d.getDataInfo().setTrust("0.9");
|
||||||
|
d.setLastupdatetimestamp(ts);
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Organization organizationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Organization o = new Organization(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
o.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
|
||||||
|
StringBuilder trust = new StringBuilder("0.0");
|
||||||
|
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(pub -> {
|
||||||
|
try {
|
||||||
|
Organization organization = mapper.readValue(pub, Organization.class);
|
||||||
|
|
||||||
|
final String currentTrust = organization.getDataInfo().getTrust();
|
||||||
|
if (!"1.0".equals(currentTrust)) {
|
||||||
|
trust.setLength(0);
|
||||||
|
trust.append(currentTrust);
|
||||||
|
}
|
||||||
|
o.mergeFrom(organization);
|
||||||
|
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (o.getDataInfo() == null) {
    o.setDataInfo(new DataInfo());
}
|
||||||
|
o.getDataInfo().setTrust("0.9");
|
||||||
|
o.setLastupdatetimestamp(ts);
|
||||||
|
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
OtherResearchProduct o = new OtherResearchProduct(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
o.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(orp -> {
|
||||||
|
try {
|
||||||
|
OtherResearchProduct otherResearchProduct = mapper.readValue(orp, OtherResearchProduct.class);
|
||||||
|
|
||||||
|
o.mergeFrom(otherResearchProduct);
|
||||||
|
o.setAuthor(DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
|
||||||
|
//add to the list if they are not null
|
||||||
|
if (otherResearchProduct.getDateofacceptance() != null)
|
||||||
|
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (o.getDataInfo() == null)
|
||||||
|
o.setDataInfo(new DataInfo());
|
||||||
|
o.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||||
|
o.getDataInfo().setTrust("0.9");
|
||||||
|
o.setLastupdatetimestamp(ts);
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,219 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import org.apache.commons.codec.binary.Hex;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.spark.SparkContext;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.security.MessageDigest;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
public class DedupUtility {
|
||||||
|
private static final Double THRESHOLD = 0.95;
|
||||||
|
|
||||||
|
public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {
|
||||||
|
|
||||||
|
Map<String, LongAccumulator> accumulators = new HashMap<>();
|
||||||
|
|
||||||
|
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
|
||||||
|
accumulators.put(acc1, context.longAccumulator(acc1));
|
||||||
|
String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
|
||||||
|
accumulators.put(acc2, context.longAccumulator(acc2));
|
||||||
|
String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
|
||||||
|
accumulators.put(acc3, context.longAccumulator(acc3));
|
||||||
|
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
|
||||||
|
accumulators.put(acc4, context.longAccumulator(acc4));
|
||||||
|
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
|
||||||
|
accumulators.put(acc5, context.longAccumulator(acc5));
|
||||||
|
String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
|
||||||
|
accumulators.put(acc6, context.longAccumulator(acc6));
|
||||||
|
|
||||||
|
return accumulators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
|
||||||
|
return context.textFile(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void deleteIfExists(String path) throws IOException {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
FileSystem fileSystem = FileSystem.get(conf);
|
||||||
|
if (fileSystem.exists(new Path(path))) {
|
||||||
|
fileSystem.delete(new Path(path), true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
|
||||||
|
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
FileSystem fileSystem = FileSystem.get(conf);
|
||||||
|
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));
|
||||||
|
|
||||||
|
return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||||
|
final StringWriter sw = new StringWriter();
|
||||||
|
try {
|
||||||
|
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||||
|
return sw.toString();
|
||||||
|
} catch (final IOException e) {
|
||||||
|
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
||||||
|
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String md5(final String s) {
|
||||||
|
try {
|
||||||
|
final MessageDigest md = MessageDigest.getInstance("MD5");
|
||||||
|
md.update(s.getBytes("UTF-8"));
|
||||||
|
return new String(Hex.encodeHex(md.digest()));
|
||||||
|
} catch (final Exception e) {
|
||||||
|
System.err.println("Error creating id");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||||
|
int pa = countAuthorsPids(a);
|
||||||
|
int pb = countAuthorsPids(b);
|
||||||
|
List<Author> base, enrich;
|
||||||
|
int sa = authorsSize(a);
|
||||||
|
int sb = authorsSize(b);
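// the author list with more persistent identifiers becomes the base (the longer list on a tie); the other list only contributes pids missing from the base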
|
||||||
|
|
||||||
|
if (pa == pb) {
|
||||||
|
base = sa > sb ? a : b;
|
||||||
|
enrich = sa > sb ? b : a;
|
||||||
|
} else {
|
||||||
|
base = pa > pb ? a : b;
|
||||||
|
enrich = pa > pb ? b : a;
|
||||||
|
}
|
||||||
|
enrichPidFromList(base, enrich);
|
||||||
|
return base;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
||||||
|
if (base == null || enrich == null)
|
||||||
|
return;
|
||||||
|
final Map<String, Author> basePidAuthorMap = base.stream()
|
||||||
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
|
.flatMap(a -> a.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(p -> new Tuple2<>(p.toComparableString(), a))
|
||||||
|
).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||||
|
.stream()
|
||||||
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
|
.flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
|
||||||
|
pidToEnrich.forEach(a -> {
|
||||||
|
Optional<Tuple2<Double, Author>> simAuthor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
    Author r = simAuthor.get()._2();
|
||||||
|
r.getPid().add(a._1());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String createEntityPath(final String basePath, final String entityType) {
|
||||||
|
return String.format("%s/%s", basePath, entityType);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String createSimRelPath(final String basePath, final String entityType) {
|
||||||
|
return String.format("%s/%s_simRel", basePath, entityType);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String createMergeRelPath(final String basePath, final String entityType) {
|
||||||
|
return String.format("%s/%s_mergeRel", basePath, entityType);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double sim(Author a, Author b) {
|
||||||
|
|
||||||
|
final Person pa = parse(a);
|
||||||
|
final Person pb = parse(b);
|
||||||
|
|
||||||
|
if (pa.isAccurate() && pb.isAccurate()) {
|
||||||
|
return new JaroWinkler().score(
|
||||||
|
normalize(pa.getSurnameString()),
|
||||||
|
normalize(pb.getSurnameString()));
|
||||||
|
} else {
|
||||||
|
return new JaroWinkler().score(
|
||||||
|
normalize(pa.getNormalisedFullname()),
|
||||||
|
normalize(pb.getNormalisedFullname()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String normalize(final String s) {
|
||||||
|
return nfd(s).toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||||
|
.replaceAll("(\\W)+", " ")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Person parse(Author author) {
|
||||||
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||||
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
||||||
|
} else {
|
||||||
|
return new Person(author.getFullname(), false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static int countAuthorsPids(List<Author> authors) {
|
||||||
|
if (authors == null)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return (int) authors.stream().filter(DedupUtility::hasPid).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int authorsSize(List<Author> authors) {
|
||||||
|
if (authors == null)
|
||||||
|
return 0;
|
||||||
|
return authors.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean hasPid(Author a) {
|
||||||
|
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
||||||
|
return false;
|
||||||
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,162 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
|
import eu.dnetlib.pace.util.BlockProcessor;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function2;
|
||||||
|
import org.apache.spark.api.java.function.PairFlatMapFunction;
|
||||||
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import scala.Serializable;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
public class Deduper implements Serializable {
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(Deduper.class);
|
||||||
|
|
||||||
|
/**
 * @param context  the Spark context
 * @param entities the list of JSON entities to be deduplicated
 * @param config   the dedup configuration
 * @return the list of relations generated by the deduplication
 */
|
||||||
|
public static JavaPairRDD<String, String> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
|
||||||
|
|
||||||
|
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
|
||||||
|
|
||||||
|
//create vertexes of the graph: <ID, MapDocument>
|
||||||
|
JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
|
||||||
|
|
||||||
|
|
||||||
|
//create blocks for deduplication
|
||||||
|
JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
|
||||||
|
|
||||||
|
//create relations by comparing only elements in the same group
|
||||||
|
return computeRelations(context, blocks, config);
|
||||||
|
|
||||||
|
// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "equalTo")).rdd();
|
||||||
|
//
|
||||||
|
// RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
|
||||||
|
// accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
|
||||||
|
//
|
||||||
|
// return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * @param context the Spark context
 * @param blocks  the list of blocks
 * @param config  the dedup configuration
 * @return the list of relations generated by the deduplication
 */
|
||||||
|
public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {
|
||||||
|
|
||||||
|
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
|
||||||
|
|
||||||
|
return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
|
||||||
|
final SparkReporter reporter = new SparkReporter(accumulators);
|
||||||
|
new BlockProcessor(config).process(it._1(), it._2(), reporter);
|
||||||
|
return reporter.getRelations().iterator();
|
||||||
|
|
||||||
|
}).mapToPair(
|
||||||
|
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
|
||||||
|
new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
|
||||||
|
.reduceByKey((a, b) -> a)
|
||||||
|
.mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * @param context the Spark context
 * @param mapDocs the list of entities: <id, entity>
 * @param config  the dedup configuration
 * @return the list of blocks based on the clustering defined in the dedup configuration
 */
|
||||||
|
public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
|
||||||
|
return mapDocs
|
||||||
|
//the reduce is just to be sure that we haven't document with same id
|
||||||
|
.reduceByKey((a, b) -> a)
|
||||||
|
.map(Tuple2::_2)
|
||||||
|
//Clustering: from <id, doc> to List<groupkey,doc>
|
||||||
|
.flatMapToPair((PairFlatMapFunction<MapDocument, String, MapDocument>) a ->
|
||||||
|
DedupUtility.getGroupingKeys(config, a)
|
||||||
|
.stream()
|
||||||
|
.map(it -> new Tuple2<>(it, a))
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.iterator())
|
||||||
|
.groupByKey();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
|
||||||
|
final String of = config.getWf().getOrderField();
|
||||||
|
final int maxQueueSize = config.getWf().getGroupMaxSize();
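// each block is kept sorted by the configured order field and truncated to groupMaxSize elements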
|
||||||
|
return mapDocs
|
||||||
|
//the reduce is just to be sure that we haven't document with same id
|
||||||
|
.reduceByKey((a, b) -> a)
|
||||||
|
.map(Tuple2::_2)
|
||||||
|
//Clustering: from <id, doc> to List<groupkey,doc>
|
||||||
|
.flatMapToPair((PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a ->
|
||||||
|
DedupUtility.getGroupingKeys(config, a)
|
||||||
|
.stream()
|
||||||
|
.map(it -> {
|
||||||
|
List<MapDocument> tmp = new ArrayList<>();
|
||||||
|
tmp.add(a);
|
||||||
|
return new Tuple2<>(it, tmp);
|
||||||
|
}
|
||||||
|
)
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.iterator())
|
||||||
|
.reduceByKey((Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
|
||||||
|
v1.addAll(v2);
|
||||||
|
v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
|
||||||
|
if (v1.size() > maxQueueSize)
|
||||||
|
return new ArrayList<>(v1.subList(0, maxQueueSize));
|
||||||
|
return v1;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * @param context  the Spark context
 * @param entities the list of JSON entities
 * @param config   the dedup configuration
 * @return the list of vertexes: <id, mapDocument>
 */
|
||||||
|
public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
|
||||||
|
|
||||||
|
return entities.mapToPair((PairFunction<String, String, MapDocument>) s -> {
|
||||||
|
|
||||||
|
MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
|
||||||
|
return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
|
||||||
|
|
||||||
|
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static JavaPairRDD<String, String> computeRelations2(JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
|
||||||
|
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
|
||||||
|
|
||||||
|
return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
|
||||||
|
try {
|
||||||
|
final SparkReporter reporter = new SparkReporter(accumulators);
|
||||||
|
new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
|
||||||
|
return reporter.getRelations().iterator();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(it._2().get(0).getIdentifier(), e);
|
||||||
|
}
|
||||||
|
}).mapToPair(
|
||||||
|
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
|
||||||
|
new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
|
||||||
|
.reduceByKey((a, b) -> a)
|
||||||
|
.mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,15 @@
package eu.dnetlib.dedup;

public enum OafEntityType {

    datasource,
    organization,
    project,
    dataset,
    otherresearchproduct,
    software,
    publication

}

@ -0,0 +1,79 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import com.google.common.hash.HashFunction;
|
||||||
|
import com.google.common.hash.Hashing;
|
||||||
|
import eu.dnetlib.dedup.graph.ConnectedComponent;
|
||||||
|
import eu.dnetlib.dedup.graph.GraphProcessor;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
import org.apache.spark.graphx.Edge;
|
||||||
|
import org.apache.spark.rdd.RDD;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class SparkCreateConnectedComponent {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
final SparkSession spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(SparkCreateConnectedComponent.class.getSimpleName())
|
||||||
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate();
|
||||||
|
|
||||||
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
final String inputPath = parser.get("sourcePath");
|
||||||
|
final String entity = parser.get("entity");
|
||||||
|
final String targetPath = parser.get("targetPath");
|
||||||
|
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||||
|
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||||
|
|
||||||
|
final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
|
||||||
|
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
||||||
|
.mapToPair((PairFunction<String, Object, String>)
|
||||||
|
s -> new Tuple2<Object, String>(getHashcode(s), s)
|
||||||
|
);
|
||||||
|
|
||||||
|
final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class));
|
||||||
|
final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd();
|
||||||
|
final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
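// every connected component with more than one member yields a pair of relations: the dedup root "merges" each member, and each member "isMergedIn" the root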
|
||||||
|
final Dataset<Relation> mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction<ConnectedComponent, Relation>) c ->
|
||||||
|
c.getDocIds()
|
||||||
|
.stream()
|
||||||
|
.flatMap(id -> {
|
||||||
|
List<Relation> tmp = new ArrayList<>();
|
||||||
|
Relation r = new Relation();
|
||||||
|
r.setSource(c.getCcId());
|
||||||
|
r.setTarget(id);
|
||||||
|
r.setRelClass("merges");
|
||||||
|
tmp.add(r);
|
||||||
|
r = new Relation();
|
||||||
|
r.setTarget(c.getCcId());
|
||||||
|
r.setSource(id);
|
||||||
|
r.setRelClass("isMergedIn");
|
||||||
|
tmp.add(r);
|
||||||
|
return tmp.stream();
|
||||||
|
}).iterator()).rdd(), Encoders.bean(Relation.class));
|
||||||
|
mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long getHashcode(final String id) {
|
||||||
|
return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,39 @@
|
package eu.dnetlib.dedup;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class SparkCreateDedupRecord {

    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkCreateDedupRecord.class.getSimpleName())
                .master(parser.get("master"))
                .getOrCreate();

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        final String sourcePath = parser.get("sourcePath");
        final String entity = parser.get("entity");
        final String dedupPath = parser.get("dedupPath");
//        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

        final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath, entity), DedupUtility.createEntityPath(sourcePath, entity), OafEntityType.valueOf(entity), dedupConf);
        dedupRecord.map(r -> {
            ObjectMapper mapper = new ObjectMapper();
            return mapper.writeValueAsString(r);
        }).saveAsTextFile(dedupPath + "/" + entity + "_dedup_record_json");
    }

}
|
@ -0,0 +1,73 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import com.google.common.hash.Hashing;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * This Spark job creates similarity relations between entities and saves the result.
 *
 * Expected parameters:
 *  sourcePath
 *  entityType
 *  targetPath
 */
|
||||||
|
public class SparkCreateSimRels {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
final SparkSession spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(SparkCreateSimRels.class.getSimpleName())
|
||||||
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate();
|
||||||
|
|
||||||
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
final String inputPath = parser.get("sourcePath");
|
||||||
|
final String entity = parser.get("entity");
|
||||||
|
final String targetPath = parser.get("targetPath");
|
||||||
|
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||||
|
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||||
|
|
||||||
|
final long total = sc.textFile(inputPath + "/" + entity).count();
|
||||||
|
|
||||||
|
JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(inputPath + "/" + entity)
|
||||||
|
.mapToPair(s->{
|
||||||
|
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s);
|
||||||
|
return new Tuple2<>(d.getIdentifier(), d);});
|
||||||
|
|
||||||
|
//create blocks for deduplication
|
||||||
|
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
|
||||||
|
// JavaPairRDD<String, Iterable<MapDocument>> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf);
|
||||||
|
|
||||||
|
//create relations by comparing only elements in the same group
|
||||||
|
final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
|
||||||
|
// final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations(sc, blocks, dedupConf);
|
||||||
|
|
||||||
|
final JavaRDD<Relation> isSimilarToRDD = dedupRels.map(simRel -> {
|
||||||
|
final Relation r = new Relation();
|
||||||
|
r.setSource(simRel._1());
|
||||||
|
r.setTarget(simRel._2());
|
||||||
|
r.setRelClass("isSimilarTo");
|
||||||
|
return r;
|
||||||
|
});
|
||||||
|
|
||||||
|
spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.util.Reporter;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import scala.Serializable;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class SparkReporter implements Serializable, Reporter {
|
||||||
|
|
||||||
|
final List<Tuple2<String, String>> relations = new ArrayList<>();
|
||||||
|
private static final Log log = LogFactory.getLog(SparkReporter.class);
|
||||||
|
Map<String, LongAccumulator> accumulators;
|
||||||
|
|
||||||
|
public SparkReporter(Map<String, LongAccumulator> accumulators){
|
||||||
|
this.accumulators = accumulators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrementCounter(String counterGroup, String counterName, long delta, Map<String, LongAccumulator> accumulators) {
|
||||||
|
|
||||||
|
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
|
||||||
|
if (accumulators.containsKey(accumulatorName)){
|
||||||
|
accumulators.get(accumulatorName).add(delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void incrementCounter(String counterGroup, String counterName, long delta) {
|
||||||
|
|
||||||
|
incrementCounter(counterGroup, counterName, delta, accumulators);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void emit(String type, String from, String to) {
|
||||||
|
relations.add(new Tuple2<>(from, to));
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Tuple2<String, String>> getRelations() {
|
||||||
|
return relations;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,80 @@
|
package eu.dnetlib.dedup.graph;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;

import java.io.IOException;
import java.io.Serializable;
import java.util.Set;

public class ConnectedComponent implements Serializable {

    private Set<String> docIds;
    private String ccId;

    public ConnectedComponent() {
    }

    public ConnectedComponent(Set<String> docIds) {
        this.docIds = docIds;
        createID();
    }

    public String createID() {
        if (docIds.size() > 1) {
            final String s = getMin();
            String prefix = s.split("\\|")[0];
            ccId = prefix + "|dedup_______::" + DedupUtility.md5(s);
            return ccId;
        } else {
            return docIds.iterator().next();
        }
    }

    @JsonIgnore
    public String getMin() {

        final StringBuilder min = new StringBuilder();
        docIds.forEach(i -> {
            if (StringUtils.isBlank(min.toString())) {
                min.append(i);
            } else {
                if (min.toString().compareTo(i) > 0) {
                    min.setLength(0);
                    min.append(i);
                }
            }
        });
        return min.toString();
    }

    @Override
    public String toString() {
        ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Failed to create Json: ", e);
        }
    }

    public Set<String> getDocIds() {
        return docIds;
    }

    public void setDocIds(Set<String> docIds) {
        this.docIds = docIds;
    }

    public String getCcId() {
        return ccId;
    }

    public void setCcId(String ccId) {
        this.ccId = ccId;
    }
}
|
@ -0,0 +1,37 @@
|
package eu.dnetlib.dedup.graph

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions

object GraphProcessor {

  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionBy
    val cc = graph.connectedComponents(maxIterations).vertices

    val joinResult = vertexes.leftOuterJoin(cc).map {
      case (id, (openaireId, cc)) => {
        if (cc.isEmpty) {
          (id, openaireId)
        }
        else {
          (cc.get, openaireId)
        }
      }
    }
    val connectedComponents = joinResult.groupByKey()
      .map[ConnectedComponent](cc => asConnectedComponent(cc))
    connectedComponents
  }

  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
    val docs = group._2.toSet[String]
    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs))
    connectedComponent
  }

}
|
@ -0,0 +1,33 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "mt",
|
||||||
|
"paramLongName": "master",
|
||||||
|
"paramDescription": "should be local or yarn",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "s",
|
||||||
|
"paramLongName": "sourcePath",
|
||||||
|
"paramDescription": "the path of the sequential file to read",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "e",
|
||||||
|
"paramLongName": "entity",
|
||||||
|
"paramDescription": "the type of entity to be deduped",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "c",
|
||||||
|
"paramLongName": "dedupConf",
|
||||||
|
"paramDescription": "dedup configuration to be used",
|
||||||
|
"compressed": true,
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "d",
|
||||||
|
"paramLongName": "dedupPath",
|
||||||
|
"paramDescription": "dedup path to load mergeRelation",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,33 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "mt",
|
||||||
|
"paramLongName": "master",
|
||||||
|
"paramDescription": "should be local or yarn",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "s",
|
||||||
|
"paramLongName": "sourcePath",
|
||||||
|
"paramDescription": "the path of the sequential file to read",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "e",
|
||||||
|
"paramLongName": "entity",
|
||||||
|
"paramDescription": "the type of entity to be deduped",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "c",
|
||||||
|
"paramLongName": "dedupConf",
|
||||||
|
"paramDescription": "dedup configuration to be used",
|
||||||
|
"compressed": true,
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "t",
|
||||||
|
"paramLongName": "targetPath",
|
||||||
|
"paramDescription": "target path to save dedup result",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,26 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hive_metastore_uris</name>
|
||||||
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hive_db_name</name>
|
||||||
|
<value>openaire</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,126 @@
|
||||||
|
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<description>the source path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>entity</name>
|
||||||
|
<description>the entity that should be processed</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dedupConf</name>
|
||||||
|
<description>the dedup Configuration</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>targetPath</name>
|
||||||
|
<description>the target path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="CreateSimRels"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<!-- <action name="DeleteTargetPath">-->
|
||||||
|
<!-- <fs>-->
|
||||||
|
<!-- <delete path='${targetPath}/${entity}_simrel'/>-->
|
||||||
|
<!-- <delete path='${targetPath}/${entity}_mergeRels'/>-->
|
||||||
|
<!-- </fs>-->
|
||||||
|
<!-- <ok to="CreateSimRels"/>-->
|
||||||
|
<!-- <error to="Kill"/>-->
|
||||||
|
<!-- </action>-->
|
||||||
|
|
||||||
|
<action name="CreateSimRels">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Create Similarity Relations</name>
|
||||||
|
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
|
||||||
|
<jar>dhp-dedup-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory} --conf
|
||||||
|
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
|
||||||
|
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
|
||||||
|
spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
|
<arg>--entity</arg><arg>${entity}</arg>
|
||||||
|
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="CreateConnectedComponents"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="CreateConnectedComponents">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Create Connected Components</name>
|
||||||
|
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
|
||||||
|
<jar>dhp-dedup-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory} --conf
|
||||||
|
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
|
||||||
|
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
|
||||||
|
spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
|
<arg>--entity</arg><arg>${entity}</arg>
|
||||||
|
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="CreateDedupRecord"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="CreateDedupRecord">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Create Dedup Record</name>
|
||||||
|
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
|
||||||
|
<jar>dhp-dedup-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory} --conf
|
||||||
|
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
|
||||||
|
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
|
||||||
|
spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--dedupPath</arg><arg>${dedupPath}</arg>
|
||||||
|
<arg>--entity</arg><arg>${entity}</arg>
|
||||||
|
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,61 @@
|
||||||
|
package eu.dnetlib.dedup;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
public class MergeAuthorTest {
|
||||||
|
|
||||||
|
List<Publication> publicationsToMerge;
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json"));
|
||||||
|
|
||||||
|
|
||||||
|
publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> {
|
||||||
|
try {
|
||||||
|
return mapper.readValue(s, Publication.class);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}).collect(Collectors.toList());
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws Exception {
|
||||||
|
Publication dedup = new Publication();
|
||||||
|
|
||||||
|
|
||||||
|
publicationsToMerge.forEach(p-> {
|
||||||
|
dedup.mergeFrom(p);
|
||||||
|
dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor()));
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
System.out.println(mapper.writeValueAsString(dedup));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,86 @@
package eu.dnetlib.dedup;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.List;

public class SparkCreateDedupTest {

    String configuration;
    String entity = "organization";

    @Before
    public void setUp() throws IOException {
        configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
    }

    // the end-to-end tests below are @Ignore'd: they read dumps from a local path
    @Test
    @Ignore
    public void createSimRelsTest() throws Exception {
        SparkCreateSimRels.main(new String[] {
                "-mt", "local[*]",
                "-s", "/Users/miconis/dumps",
                "-e", entity,
                "-c", ArgumentApplicationParser.compressArgument(configuration),
                "-t", "/tmp/dedup",
        });
    }

    @Test
    @Ignore
    public void createCCTest() throws Exception {
        SparkCreateConnectedComponent.main(new String[] {
                "-mt", "local[*]",
                "-s", "/Users/miconis/dumps",
                "-e", entity,
                "-c", ArgumentApplicationParser.compressArgument(configuration),
                "-t", "/tmp/dedup",
        });
    }

    @Test
    @Ignore
    public void dedupRecordTest() throws Exception {
        SparkCreateDedupRecord.main(new String[] {
                "-mt", "local[*]",
                "-s", "/Users/miconis/dumps",
                "-e", entity,
                "-c", ArgumentApplicationParser.compressArgument(configuration),
                "-d", "/tmp/dedup",
        });
    }

    @Test
    public void printConfiguration() throws Exception {
        System.out.println(ArgumentApplicationParser.compressArgument(configuration));
    }

    @Test
    public void testHashCode() {
        final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f";
        final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46";

        // compare Java's 32-bit hashCode with 128-bit murmur3 hashing on two entity identifiers
        final HashFunction hashFunction = Hashing.murmur3_128();

        System.out.println(s1.hashCode());
        System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
        System.out.println(s2.hashCode());
        System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
    }
}
@ -0,0 +1,31 @@
package eu.dnetlib.dedup.jpath;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import org.apache.commons.io.IOUtils;
import org.junit.Test;

import java.util.List;
import java.util.Map;

public class JsonPathTest {

    @Test
    public void testJPath() throws Exception {
        final String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/sample.json"));
        List<Map<String, Object>> pid = JsonPath.read(json, "$.pid[*]");
        // System.out.println(json);

        pid.forEach(it -> {
            try {
                System.out.println(new ObjectMapper().writeValueAsString(it));
            } catch (JsonProcessingException e) {
                e.printStackTrace();
            }
        });
    }
}
@ -0,0 +1,266 @@
|
||||||
|
{
|
||||||
|
"wf" : {
|
||||||
|
"threshold" : "0.99",
|
||||||
|
"dedupRun" : "001",
|
||||||
|
"entityType" : "organization",
|
||||||
|
"orderField" : "legalname",
|
||||||
|
"queueMaxSize" : "2000",
|
||||||
|
"groupMaxSize" : "50",
|
||||||
|
"slidingWindowSize" : "200",
|
||||||
|
"idPath":"$.id",
|
||||||
|
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||||
|
"includeChildren" : "true",
|
||||||
|
"maxIterations": "20"
|
||||||
|
},
|
||||||
|
"pace" : {
|
||||||
|
"clustering" : [
|
||||||
|
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||||
|
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||||
|
],
|
||||||
|
"decisionTree" : {
|
||||||
|
"start": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "gridid",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer2",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
},
|
||||||
|
"layer2": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "websiteurl",
|
||||||
|
"comparator": "domainExactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "country",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "numbersMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "romansMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1,
|
||||||
|
"aggregation": "AND",
|
||||||
|
"positive": "layer3",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer3",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer3": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "cityMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.7,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "layer4",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer4": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "keywordMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.9,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "layer5",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer5",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer5": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "jaroWinklerNormalizedName",
|
||||||
|
"weight": 0.9,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "legalshortname",
|
||||||
|
"comparator": "jaroWinklerNormalizedName",
|
||||||
|
"weight": 0.1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.9,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"model" : [
|
||||||
|
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
||||||
|
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
||||||
|
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||||
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||||
|
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
|
||||||
|
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||||
|
],
|
||||||
|
"blacklists" : {
|
||||||
|
"legalname" : []
|
||||||
|
},
|
||||||
|
"synonyms": {
|
||||||
|
"key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||||
|
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||||
|
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||||
|
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||||
|
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
||||||
|
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
||||||
|
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
||||||
|
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||||
|
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||||
|
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||||
|
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||||
|
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||||
|
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||||
|
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
||||||
|
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
||||||
|
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
||||||
|
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
||||||
|
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
||||||
|
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
|
||||||
|
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
||||||
|
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
||||||
|
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
||||||
|
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
||||||
|
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
||||||
|
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
||||||
|
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
||||||
|
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
||||||
|
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
||||||
|
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
||||||
|
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
||||||
|
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
||||||
|
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
||||||
|
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
||||||
|
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
||||||
|
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
||||||
|
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
||||||
|
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
||||||
|
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
||||||
|
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
||||||
|
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
||||||
|
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
||||||
|
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
||||||
|
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
||||||
|
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
||||||
|
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
||||||
|
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
||||||
|
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
||||||
|
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
||||||
|
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
||||||
|
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
||||||
|
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
|
||||||
|
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
|
||||||
|
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
|
||||||
|
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
|
||||||
|
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
|
||||||
|
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
|
||||||
|
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
|
||||||
|
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
|
||||||
|
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
|
||||||
|
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
|
||||||
|
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
|
||||||
|
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
|
||||||
|
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
|
||||||
|
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
|
||||||
|
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
|
||||||
|
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
|
||||||
|
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
|
||||||
|
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
|
||||||
|
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
|
||||||
|
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
|
||||||
|
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
|
||||||
|
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
|
||||||
|
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
|
||||||
|
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
|
||||||
|
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
|
||||||
|
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
|
||||||
|
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
|
||||||
|
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
|
||||||
|
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
|
||||||
|
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
|
||||||
|
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
|
||||||
|
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
|
||||||
|
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
|
||||||
|
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
|
||||||
|
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
|
||||||
|
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
|
||||||
|
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
|
||||||
|
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
|
||||||
|
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
|
||||||
|
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
|
||||||
|
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
|
||||||
|
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
||||||
|
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
||||||
|
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
||||||
|
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
||||||
|
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
||||||
|
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
||||||
|
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
||||||
|
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
|
||||||
|
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
|
||||||
|
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
|
||||||
|
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
||||||
|
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||||
|
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
||||||
|
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
|
||||||
|
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
|
||||||
|
"key::107" : ["agricultural forestry", "af", "a f"],
|
||||||
|
"key::108" : ["agricultural mechanical", "am", "a m"],
|
||||||
|
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,280 @@
|
||||||
|
{
|
||||||
|
"wf" : {
|
||||||
|
"threshold" : "0.99",
|
||||||
|
"dedupRun" : "001",
|
||||||
|
"entityType" : "result",
|
||||||
|
"subEntityType" : "resulttype",
|
||||||
|
"subEntityValue" : "publication",
|
||||||
|
"orderField" : "title",
|
||||||
|
"queueMaxSize" : "2000",
|
||||||
|
"groupMaxSize" : "100",
|
||||||
|
"maxChildren" : "100",
|
||||||
|
"idPath": "$.id",
|
||||||
|
"slidingWindowSize" : "200",
|
||||||
|
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
|
"includeChildren" : "true"
|
||||||
|
},
|
||||||
|
"pace" : {
|
||||||
|
"clustering" : [
|
||||||
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
|
],
|
||||||
|
"strictConditions" : [
|
||||||
|
{ "name" : "pidMatch", "fields" : [ "pid" ] }
|
||||||
|
],
|
||||||
|
"conditions" : [
|
||||||
|
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
|
||||||
|
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
|
||||||
|
],
|
||||||
|
"model" : [
|
||||||
|
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" },
|
||||||
|
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" },
|
||||||
|
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 },
|
||||||
|
{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 },
|
||||||
|
{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" }
|
||||||
|
],
|
||||||
|
"synonyms": {},
|
||||||
|
"blacklists" : {
|
||||||
|
"title" : [
|
||||||
|
"^Inside Front Cover$",
|
||||||
|
"(?i)^Poster presentations$",
|
||||||
|
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
|
||||||
|
"^Problems with perinatal pathology\\.?$",
|
||||||
|
"(?i)^Cases? of Puerperal Convulsions$",
|
||||||
|
"(?i)^Operative Gyna?ecology$",
|
||||||
|
"(?i)^Mind the gap\\!?\\:?$",
|
||||||
|
"^Chronic fatigue syndrome\\.?$",
|
||||||
|
"^Cartas? ao editor Letters? to the Editor$",
|
||||||
|
"^Note from the Editor$",
|
||||||
|
"^Anesthesia Abstract$",
|
||||||
|
|
||||||
|
"^Annual report$",
|
||||||
|
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
|
||||||
|
"(?i)^Graph and Table of Infectious Diseases?$",
|
||||||
|
"^Presentation$",
|
||||||
|
"(?i)^Reviews and Information on Publications$",
|
||||||
|
"(?i)^PUBLIC HEALTH SERVICES?$",
|
||||||
|
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
|
||||||
|
"(?i)^Adrese autora$",
|
||||||
|
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
|
||||||
|
"(?i)^Acknowledgement to Referees$",
|
||||||
|
"(?i)^Behçet's disease\\.?$",
|
||||||
|
"(?i)^Isolation and identification of restriction endonuclease.*$",
|
||||||
|
"(?i)^CEREBROVASCULAR DISEASES?.?$",
|
||||||
|
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
|
||||||
|
"^Event management$",
|
||||||
|
"(?i)^Breakfast and Crohn's disease.*\\.?$",
|
||||||
|
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
|
||||||
|
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
|
||||||
|
"^Gushi hakubutsugaku$",
|
||||||
|
|
||||||
|
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
|
||||||
|
"^Intestinal spirocha?etosis$",
|
||||||
|
"^Treatment of Rodent Ulcer$",
|
||||||
|
"(?i)^\\W*Cloud Computing\\W*$",
|
||||||
|
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
|
||||||
|
"^Free Communications, Poster Presentations: Session [A-F]$",
|
||||||
|
|
||||||
|
"^“The Historical Aspects? of Quackery\\.?”$",
|
||||||
|
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
|
||||||
|
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
|
||||||
|
"(?i)^Case Report$",
|
||||||
|
"^Boletín Informativo$",
|
||||||
|
"(?i)^Glioblastoma Multiforme$",
|
||||||
|
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
|
||||||
|
"^Zaměstnanecké výhody$",
|
||||||
|
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
|
||||||
|
"(?i)^Carotid body tumours?\\.?$",
|
||||||
|
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
|
||||||
|
"^Avant-propos$",
|
||||||
|
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
|
||||||
|
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
|
||||||
|
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
|
||||||
|
"^Viñetas de Cortázar$",
|
||||||
|
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
|
||||||
|
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
|
||||||
|
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
|
||||||
|
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
|
||||||
|
|
||||||
|
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
|
||||||
|
"^Aus der AGMB$",
|
||||||
|
|
||||||
|
"^Znanstveno-stručni prilozi$",
|
||||||
|
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
|
||||||
|
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
|
||||||
|
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
|
||||||
|
"^Finanční analýza podniku$",
|
||||||
|
"^Financial analysis( of business)?$",
|
||||||
|
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
|
||||||
|
"^Jikken nihon shūshinsho$",
|
||||||
|
"(?i)^CORONER('|s)(s|') INQUESTS$",
|
||||||
|
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
|
||||||
|
"(?i)^Consultants' contract(s)?$",
|
||||||
|
"(?i)^Upute autorima$",
|
||||||
|
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
|
||||||
|
"^Joshi shin kokubun$",
|
||||||
|
"^Kōtō shōgaku dokuhon nōson'yō$",
|
||||||
|
"^Jinjō shōgaku shōka$",
|
||||||
|
"^Shōgaku shūjichō$",
|
||||||
|
"^Nihon joshi dokuhon$",
|
||||||
|
"^Joshi shin dokuhon$",
|
||||||
|
"^Chūtō kanbun dokuhon$",
|
||||||
|
"^Wabun dokuhon$",
|
||||||
|
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
|
||||||
|
"(?i)^cardiac rehabilitation$",
|
||||||
|
"(?i)^Analytical summary$",
|
||||||
|
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
|
||||||
|
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
|
||||||
|
"^Prikazi i osvrti$",
|
||||||
|
"^Rodinný dům s provozovnou$",
|
||||||
|
"^Family house with an establishment$",
|
||||||
|
"^Shinsei chūtō shin kokugun$",
|
||||||
|
"^Pulmonary alveolar proteinosis(\\.?)$",
|
||||||
|
"^Shinshū kanbun$",
|
||||||
|
"^Viñeta(s?) de Rodríguez$",
|
||||||
|
"(?i)^RUBRIKA UREDNIKA$",
|
||||||
|
"^A Matching Model of the Academic Publication Market$",
|
||||||
|
"^Yōgaku kōyō$",
|
||||||
|
|
||||||
|
"^Internetový marketing$",
|
||||||
|
"^Internet marketing$",
|
||||||
|
"^Chūtō kokugo dokuhon$",
|
||||||
|
"^Kokugo dokuhon$",
|
||||||
|
"^Antibiotic Cover for Dental Extraction(s?)$",
|
||||||
|
"^Strategie podniku$",
|
||||||
|
"^Strategy of an Enterprise$",
|
||||||
|
"(?i)^respiratory disease(s?)(\\.?)$",
|
||||||
|
"^Award(s?) for Gallantry in Civil Defence$",
|
||||||
|
"^Podniková kultura$",
|
||||||
|
"^Corporate Culture$",
|
||||||
|
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
|
||||||
|
"^Pracovní motivace$",
|
||||||
|
"^Work Motivation$",
|
||||||
|
"^Kaitei kōtō jogaku dokuhon$",
|
||||||
|
"^Konsolidovaná účetní závěrka$",
|
||||||
|
"^Consolidated Financial Statements$",
|
||||||
|
"(?i)^intracranial tumour(s?)$",
|
||||||
|
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
|
||||||
|
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
|
||||||
|
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
|
||||||
|
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
|
||||||
|
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
|
||||||
|
"^The level of motivation process as a leadership$",
|
||||||
|
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
|
||||||
|
"(?i)^news and events$",
|
||||||
|
"(?i)^NOVOSTI I DOGAĐAJI$",
|
||||||
|
"^Sansū no gakushū$",
|
||||||
|
"^Posouzení informačního systému firmy a návrh změn$",
|
||||||
|
"^Information System Assessment and Proposal for ICT Modification$",
|
||||||
|
"^Stresové zatížení pracovníků ve vybrané profesi$",
|
||||||
|
"^Stress load in a specific job$",
|
||||||
|
|
||||||
|
"^Sunday: Poster Sessions, Pt.*$",
|
||||||
|
"^Monday: Poster Sessions, Pt.*$",
|
||||||
|
"^Wednesday: Poster Sessions, Pt.*",
|
||||||
|
"^Tuesday: Poster Sessions, Pt.*$",
|
||||||
|
|
||||||
|
"^Analýza reklamy$",
|
||||||
|
"^Analysis of advertising$",
|
||||||
|
|
||||||
|
"^Shōgaku shūshinsho$",
|
||||||
|
"^Shōgaku sansū$",
|
||||||
|
"^Shintei joshi kokubun$",
|
||||||
|
"^Taishō joshi kokubun dokuhon$",
|
||||||
|
"^Joshi kokubun$",
|
||||||
|
|
||||||
|
"^Účetní uzávěrka a účetní závěrka v ČR$",
|
||||||
|
"(?i)^The \"?Causes\"? of Cancer$",
|
||||||
|
"^Normas para la publicación de artículos$",
|
||||||
|
"^Editor('|s)(s|') [Rr]eply$",
|
||||||
|
"^Editor(’|s)(s|’) letter$",
|
||||||
|
"^Redaktoriaus žodis$",
|
||||||
|
"^DISCUSSION ON THE PRECEDING PAPER$",
|
||||||
|
"^Kōtō shōgaku shūshinsho jidōyō$",
|
||||||
|
"^Shōgaku nihon rekishi$",
|
||||||
|
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
|
||||||
|
"^Préface$",
|
||||||
|
"^Occupational [Hh]ealth [Ss]ervices.$",
|
||||||
|
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
|
||||||
|
"^Účetní závěrka ve vybraném podniku.*$",
|
||||||
|
"^Financial statements in selected company$",
|
||||||
|
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
|
||||||
|
"^Pseudomyxoma peritonei$",
|
||||||
|
"^Kazalo autora$",
|
||||||
|
|
||||||
|
"(?i)^uvodna riječ$",
|
||||||
|
"^Motivace jako způsob vedení lidí$",
|
||||||
|
"^Motivation as a leadership$",
|
||||||
|
"^Polyfunkční dům$",
|
||||||
|
"^Multi\\-funkcional building$",
|
||||||
|
"^Podnikatelský plán$",
|
||||||
|
"(?i)^Podnikatelský záměr$",
|
||||||
|
"(?i)^Business Plan$",
|
||||||
|
"^Oceňování nemovitostí$",
|
||||||
|
"^Marketingová komunikace$",
|
||||||
|
"^Marketing communication$",
|
||||||
|
"^Sumario Analítico$",
|
||||||
|
"^Riječ uredništva$",
|
||||||
|
"^Savjetovanja i priredbe$",
|
||||||
|
"^Índice$",
|
||||||
|
"^(Starobosanski nadpisi).*$",
|
||||||
|
"^Vzdělávání pracovníků v organizaci$",
|
||||||
|
"^Staff training in organization$",
|
||||||
|
"^(Life Histories of North American Geometridae).*$",
|
||||||
|
"^Strategická analýza podniku$",
|
||||||
|
"^Strategic Analysis of an Enterprise$",
|
||||||
|
"^Sadržaj$",
|
||||||
|
"^Upute suradnicima$",
|
||||||
|
"^Rodinný dům$",
|
||||||
|
"(?i)^Fami(l)?ly house$",
|
||||||
|
"^Upute autorima$",
|
||||||
|
"^Strategic Analysis$",
|
||||||
|
"^Finanční analýza vybraného podniku$",
|
||||||
|
"^Finanční analýza$",
|
||||||
|
"^Riječ urednika$",
|
||||||
|
"(?i)^Content(s?)$",
|
||||||
|
"(?i)^Inhalt$",
|
||||||
|
"^Jinjō shōgaku shūshinsho jidōyō$",
|
||||||
|
"(?i)^Index$",
|
||||||
|
"^Chūgaku kokubun kyōkasho$",
|
||||||
|
"^Retrato de una mujer$",
|
||||||
|
"^Retrato de un hombre$",
|
||||||
|
"^Kōtō shōgaku dokuhon$",
|
||||||
|
"^Shotōka kokugo$",
|
||||||
|
"^Shōgaku dokuhon$",
|
||||||
|
"^Jinjō shōgaku kokugo dokuhon$",
|
||||||
|
"^Shinsei kokugo dokuhon$",
|
||||||
|
"^Teikoku dokuhon$",
|
||||||
|
"^Instructions to Authors$",
|
||||||
|
"^KİTAP TAHLİLİ$",
|
||||||
|
"^PRZEGLĄD PIŚMIENNICTWA$",
|
||||||
|
"(?i)^Presentación$",
|
||||||
|
"^İçindekiler$",
|
||||||
|
"(?i)^Tabl?e of contents$",
|
||||||
|
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
|
||||||
|
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
|
||||||
|
"^Editorial( Board)?$",
|
||||||
|
"(?i)^Editorial \\(English\\)$",
|
||||||
|
"^Editörden$",
|
||||||
|
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||||
|
"^(Kiri Karl Morgensternile).*$",
|
||||||
|
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||||
|
"^(\\[Eksliibris Aleksandr).*$",
|
||||||
|
"^(Eksliibris Aleksandr).*$",
|
||||||
|
"^(Kiri A\\. de Vignolles).*$",
|
||||||
|
"^(2 kirja Karl Morgensternile).*$",
|
||||||
|
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||||
|
"^(Kiri tundmatule).*$",
|
||||||
|
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||||
|
"^(Eksliibris Nikolai Birukovile).*$",
|
||||||
|
"^(Eksliibris Nikolai Issakovile).*$",
|
||||||
|
"^(WHP Cruise Summary Information of section).*$",
|
||||||
|
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||||
|
"^(Measurement of the spin\\-dependent structure function).*",
|
||||||
|
"(?i)^.*authors['’′]? reply\\.?$",
|
||||||
|
"(?i)^.*authors['’′]? response\\.?$"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,386 @@
|
||||||
|
{
|
||||||
|
"wf": {
|
||||||
|
"threshold": "0.99",
|
||||||
|
"dedupRun": "001",
|
||||||
|
"entityType": "result",
|
||||||
|
"subEntityType": "resulttype",
|
||||||
|
"subEntityValue": "publication",
|
||||||
|
"orderField": "title",
|
||||||
|
"queueMaxSize": "2000",
|
||||||
|
"groupMaxSize": "100",
|
||||||
|
"maxChildren": "100",
|
||||||
|
"slidingWindowSize": "200",
|
||||||
|
"rootBuilder": [
|
||||||
|
"result",
|
||||||
|
"resultProject_outcome_isProducedBy",
|
||||||
|
"resultResult_publicationDataset_isRelatedTo",
|
||||||
|
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||||
|
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||||
|
"resultOrganization_affiliation_isAffiliatedWith",
|
||||||
|
"resultResult_part_hasPart",
|
||||||
|
"resultResult_part_isPartOf",
|
||||||
|
"resultResult_supplement_isSupplementTo",
|
||||||
|
"resultResult_supplement_isSupplementedBy",
|
||||||
|
"resultResult_version_isVersionOf"
|
||||||
|
],
|
||||||
|
"includeChildren": "true",
|
||||||
|
"maxIterations": 20,
|
||||||
|
"idPath": "$.id"
|
||||||
|
},
|
||||||
|
"pace": {
|
||||||
|
"clustering": [
|
||||||
|
{
|
||||||
|
"name": "ngrampairs",
|
||||||
|
"fields": [
|
||||||
|
"title"
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"max": "1",
|
||||||
|
"ngramLen": "3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "suffixprefix",
|
||||||
|
"fields": [
|
||||||
|
"title"
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"max": "1",
|
||||||
|
"len": "3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "lowercase",
|
||||||
|
"fields": [
|
||||||
|
"doi"
|
||||||
|
],
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"decisionTree": {
|
||||||
|
"start": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "pid",
|
||||||
|
"comparator": "jsonListMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
"jpath_value": "$.value",
|
||||||
|
"jpath_classid": "$.qualifier.classid"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.5,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "layer2",
|
||||||
|
"undefined": "layer2",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer2": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "title",
|
||||||
|
"comparator": "titleVersionMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "authors",
|
||||||
|
"comparator": "sizeMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1.0,
|
||||||
|
"aggregation": "AND",
|
||||||
|
"positive": "layer3",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer3",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
},
|
||||||
|
"layer3": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "title",
|
||||||
|
"comparator": "levensteinTitle",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.99,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"model": [
|
||||||
|
{
|
||||||
|
"name": "doi",
|
||||||
|
"type": "String",
|
||||||
|
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "pid",
|
||||||
|
"type": "JSON",
|
||||||
|
"path": "$.pid",
|
||||||
|
"overrideMatch": "true"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "title",
|
||||||
|
"type": "String",
|
||||||
|
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
|
"length": 250,
|
||||||
|
"size": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "authors",
|
||||||
|
"type": "List",
|
||||||
|
"path": "$.author[*].fullname",
|
||||||
|
"size": 200
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "resulttype",
|
||||||
|
"type": "String",
|
||||||
|
"path": "$.resulttype.classid"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"blacklists": {
|
||||||
|
"title": [
|
||||||
|
"^Inside Front Cover$",
|
||||||
|
"(?i)^Poster presentations$",
|
||||||
|
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
|
||||||
|
"^Problems with perinatal pathology\\.?$",
|
||||||
|
"(?i)^Cases? of Puerperal Convulsions$",
|
||||||
|
"(?i)^Operative Gyna?ecology$",
|
||||||
|
"(?i)^Mind the gap\\!?\\:?$",
|
||||||
|
"^Chronic fatigue syndrome\\.?$",
|
||||||
|
"^Cartas? ao editor Letters? to the Editor$",
|
||||||
|
"^Note from the Editor$",
|
||||||
|
"^Anesthesia Abstract$",
|
||||||
|
"^Annual report$",
|
||||||
|
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
|
||||||
|
"(?i)^Graph and Table of Infectious Diseases?$",
|
||||||
|
"^Presentation$",
|
||||||
|
"(?i)^Reviews and Information on Publications$",
|
||||||
|
"(?i)^PUBLIC HEALTH SERVICES?$",
|
||||||
|
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
|
||||||
|
"(?i)^Adrese autora$",
|
||||||
|
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
|
||||||
|
"(?i)^Acknowledgement to Referees$",
|
||||||
|
"(?i)^Behçet's disease\\.?$",
|
||||||
|
"(?i)^Isolation and identification of restriction endonuclease.*$",
|
||||||
|
"(?i)^CEREBROVASCULAR DISEASES?.?$",
|
||||||
|
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
|
||||||
|
"^Event management$",
|
||||||
|
"(?i)^Breakfast and Crohn's disease.*\\.?$",
|
||||||
|
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
|
||||||
|
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
|
||||||
|
"^Gushi hakubutsugaku$",
|
||||||
|
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
|
||||||
|
"^Intestinal spirocha?etosis$",
|
||||||
|
"^Treatment of Rodent Ulcer$",
|
||||||
|
"(?i)^\\W*Cloud Computing\\W*$",
|
||||||
|
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
|
||||||
|
"^Free Communications, Poster Presentations: Session [A-F]$",
|
||||||
|
"^“The Historical Aspects? of Quackery\\.?”$",
|
||||||
|
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
|
||||||
|
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
|
||||||
|
"(?i)^Case Report$",
|
||||||
|
"^Boletín Informativo$",
|
||||||
|
"(?i)^Glioblastoma Multiforme$",
|
||||||
|
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
|
||||||
|
"^Zaměstnanecké výhody$",
|
||||||
|
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
|
||||||
|
"(?i)^Carotid body tumours?\\.?$",
|
||||||
|
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
|
||||||
|
"^Avant-propos$",
|
||||||
|
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
|
||||||
|
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
|
||||||
|
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
|
||||||
|
"^Viñetas de Cortázar$",
|
||||||
|
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
|
||||||
|
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
|
||||||
|
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
|
||||||
|
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
|
||||||
|
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
|
||||||
|
"^Aus der AGMB$",
|
||||||
|
"^Znanstveno-stručni prilozi$",
|
||||||
|
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
|
||||||
|
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
|
||||||
|
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
|
||||||
|
"^Finanční analýza podniku$",
|
||||||
|
"^Financial analysis( of business)?$",
|
||||||
|
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
|
||||||
|
"^Jikken nihon shūshinsho$",
|
||||||
|
"(?i)^CORONER('|s)(s|') INQUESTS$",
|
||||||
|
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
|
||||||
|
"(?i)^Consultants' contract(s)?$",
|
||||||
|
"(?i)^Upute autorima$",
|
||||||
|
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
|
||||||
|
"^Joshi shin kokubun$",
|
||||||
|
"^Kōtō shōgaku dokuhon nōson'yō$",
|
||||||
|
"^Jinjō shōgaku shōka$",
|
||||||
|
"^Shōgaku shūjichō$",
|
||||||
|
"^Nihon joshi dokuhon$",
|
||||||
|
"^Joshi shin dokuhon$",
|
||||||
|
"^Chūtō kanbun dokuhon$",
|
||||||
|
"^Wabun dokuhon$",
|
||||||
|
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
|
||||||
|
"(?i)^cardiac rehabilitation$",
|
||||||
|
"(?i)^Analytical summary$",
|
||||||
|
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
|
||||||
|
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
|
||||||
|
"^Prikazi i osvrti$",
|
||||||
|
"^Rodinný dům s provozovnou$",
|
||||||
|
"^Family house with an establishment$",
|
||||||
|
"^Shinsei chūtō shin kokugun$",
|
||||||
|
"^Pulmonary alveolar proteinosis(\\.?)$",
|
||||||
|
"^Shinshū kanbun$",
|
||||||
|
"^Viñeta(s?) de Rodríguez$",
|
||||||
|
"(?i)^RUBRIKA UREDNIKA$",
|
||||||
|
"^A Matching Model of the Academic Publication Market$",
|
||||||
|
"^Yōgaku kōyō$",
|
||||||
|
"^Internetový marketing$",
|
||||||
|
"^Internet marketing$",
|
||||||
|
"^Chūtō kokugo dokuhon$",
|
||||||
|
"^Kokugo dokuhon$",
|
||||||
|
"^Antibiotic Cover for Dental Extraction(s?)$",
|
||||||
|
"^Strategie podniku$",
|
||||||
|
"^Strategy of an Enterprise$",
|
||||||
|
"(?i)^respiratory disease(s?)(\\.?)$",
|
||||||
|
"^Award(s?) for Gallantry in Civil Defence$",
|
||||||
|
"^Podniková kultura$",
|
||||||
|
"^Corporate Culture$",
|
||||||
|
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
|
||||||
|
"^Pracovní motivace$",
|
||||||
|
"^Work Motivation$",
|
||||||
|
"^Kaitei kōtō jogaku dokuhon$",
|
||||||
|
"^Konsolidovaná účetní závěrka$",
|
||||||
|
"^Consolidated Financial Statements$",
|
||||||
|
"(?i)^intracranial tumour(s?)$",
|
||||||
|
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
|
||||||
|
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
|
||||||
|
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
|
||||||
|
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
|
||||||
|
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
|
||||||
|
"^The level of motivation process as a leadership$",
|
||||||
|
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
|
||||||
|
"(?i)^news and events$",
|
||||||
|
"(?i)^NOVOSTI I DOGAĐAJI$",
|
||||||
|
"^Sansū no gakushū$",
|
||||||
|
"^Posouzení informačního systému firmy a návrh změn$",
|
||||||
|
"^Information System Assessment and Proposal for ICT Modification$",
|
||||||
|
"^Stresové zatížení pracovníků ve vybrané profesi$",
|
||||||
|
"^Stress load in a specific job$",
|
||||||
|
"^Sunday: Poster Sessions, Pt.*$",
|
||||||
|
"^Monday: Poster Sessions, Pt.*$",
|
||||||
|
"^Wednesday: Poster Sessions, Pt.*",
|
||||||
|
"^Tuesday: Poster Sessions, Pt.*$",
|
||||||
|
"^Analýza reklamy$",
|
||||||
|
"^Analysis of advertising$",
|
||||||
|
"^Shōgaku shūshinsho$",
|
||||||
|
"^Shōgaku sansū$",
|
||||||
|
"^Shintei joshi kokubun$",
|
||||||
|
"^Taishō joshi kokubun dokuhon$",
|
||||||
|
"^Joshi kokubun$",
|
||||||
|
"^Účetní uzávěrka a účetní závěrka v ČR$",
|
||||||
|
"(?i)^The \"?Causes\"? of Cancer$",
|
||||||
|
"^Normas para la publicación de artículos$",
|
||||||
|
"^Editor('|s)(s|') [Rr]eply$",
|
||||||
|
"^Editor(’|s)(s|’) letter$",
|
||||||
|
"^Redaktoriaus žodis$",
|
||||||
|
"^DISCUSSION ON THE PRECEDING PAPER$",
|
||||||
|
"^Kōtō shōgaku shūshinsho jidōyō$",
|
||||||
|
"^Shōgaku nihon rekishi$",
|
||||||
|
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
|
||||||
|
"^Préface$",
|
||||||
|
"^Occupational [Hh]ealth [Ss]ervices.$",
|
||||||
|
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
|
||||||
|
"^Účetní závěrka ve vybraném podniku.*$",
|
||||||
|
"^Financial statements in selected company$",
|
||||||
|
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
|
||||||
|
"^Pseudomyxoma peritonei$",
|
||||||
|
"^Kazalo autora$",
|
||||||
|
"(?i)^uvodna riječ$",
|
||||||
|
"^Motivace jako způsob vedení lidí$",
|
||||||
|
"^Motivation as a leadership$",
|
||||||
|
"^Polyfunkční dům$",
|
||||||
|
"^Multi\\-funkcional building$",
|
||||||
|
"^Podnikatelský plán$",
|
||||||
|
"(?i)^Podnikatelský záměr$",
|
||||||
|
"(?i)^Business Plan$",
|
||||||
|
"^Oceňování nemovitostí$",
|
||||||
|
"^Marketingová komunikace$",
|
||||||
|
"^Marketing communication$",
|
||||||
|
"^Sumario Analítico$",
|
||||||
|
"^Riječ uredništva$",
|
||||||
|
"^Savjetovanja i priredbe$",
|
||||||
|
"^Índice$",
|
||||||
|
"^(Starobosanski nadpisi).*$",
|
||||||
|
"^Vzdělávání pracovníků v organizaci$",
|
||||||
|
"^Staff training in organization$",
|
||||||
|
"^(Life Histories of North American Geometridae).*$",
|
||||||
|
"^Strategická analýza podniku$",
|
||||||
|
"^Strategic Analysis of an Enterprise$",
|
||||||
|
"^Sadržaj$",
|
||||||
|
"^Upute suradnicima$",
|
||||||
|
"^Rodinný dům$",
|
||||||
|
"(?i)^Fami(l)?ly house$",
|
||||||
|
"^Upute autorima$",
|
||||||
|
"^Strategic Analysis$",
|
||||||
|
"^Finanční analýza vybraného podniku$",
|
||||||
|
"^Finanční analýza$",
|
||||||
|
"^Riječ urednika$",
|
||||||
|
"(?i)^Content(s?)$",
|
||||||
|
"(?i)^Inhalt$",
|
||||||
|
"^Jinjō shōgaku shūshinsho jidōyō$",
|
||||||
|
"(?i)^Index$",
|
||||||
|
"^Chūgaku kokubun kyōkasho$",
|
||||||
|
"^Retrato de una mujer$",
|
||||||
|
"^Retrato de un hombre$",
|
||||||
|
"^Kōtō shōgaku dokuhon$",
|
||||||
|
"^Shotōka kokugo$",
|
||||||
|
"^Shōgaku dokuhon$",
|
||||||
|
"^Jinjō shōgaku kokugo dokuhon$",
|
||||||
|
"^Shinsei kokugo dokuhon$",
|
||||||
|
"^Teikoku dokuhon$",
|
||||||
|
"^Instructions to Authors$",
|
||||||
|
"^KİTAP TAHLİLİ$",
|
||||||
|
"^PRZEGLĄD PIŚMIENNICTWA$",
|
||||||
|
"(?i)^Presentación$",
|
||||||
|
"^İçindekiler$",
|
||||||
|
"(?i)^Tabl?e of contents$",
|
||||||
|
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
|
||||||
|
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
|
||||||
|
"^Editorial( Board)?$",
|
||||||
|
"(?i)^Editorial \\(English\\)$",
|
||||||
|
"^Editörden$",
|
||||||
|
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||||
|
"^(Kiri Karl Morgensternile).*$",
|
||||||
|
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||||
|
"^(\\[Eksliibris Aleksandr).*$",
|
||||||
|
"^(Eksliibris Aleksandr).*$",
|
||||||
|
"^(Kiri A\\. de Vignolles).*$",
|
||||||
|
"^(2 kirja Karl Morgensternile).*$",
|
||||||
|
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||||
|
"^(Kiri tundmatule).*$",
|
||||||
|
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||||
|
"^(Eksliibris Nikolai Birukovile).*$",
|
||||||
|
"^(Eksliibris Nikolai Issakovile).*$",
|
||||||
|
"^(WHP Cruise Summary Information of section).*$",
|
||||||
|
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||||
|
"^(Measurement of the spin\\-dependent structure function).*",
|
||||||
|
"(?i)^.*authors['’′]? reply\\.?$",
|
||||||
|
"(?i)^.*authors['’′]? response\\.?$"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"synonyms": {}
|
||||||
|
}
|
||||||
|
}
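The expressions above appear to form a title blacklist for the dedup configuration touched by this commit: titles matching any of them (editorials, tables of contents, generic thesis titles, and so on) are presumably too generic to be used when comparing records. Below is a minimal standalone sketch of how such a list could be consumed with plain java.util.regex; the class and method names are hypothetical and this is not the dnet-pace-core API.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class TitleBlacklist {

    private final List<Pattern> patterns;

    public TitleBlacklist(List<String> regexes) {
        // compile each expression once; case-insensitive entries carry their own (?i) flag
        this.patterns = regexes.stream().map(Pattern::compile).collect(Collectors.toList());
    }

    /** Returns true when a title matches any blacklisted expression and should be skipped. */
    public boolean isBlacklisted(String title) {
        String t = title.trim();
        return patterns.stream().anyMatch(p -> p.matcher(t).matches());
    }

    public static void main(String[] args) {
        // two expressions taken verbatim from the list above
        List<String> regexes = Arrays.asList("(?i)^Tabl?e of contents$", "^Editorial( Board)?$");
        TitleBlacklist blacklist = new TitleBlacklist(regexes);
        System.out.println(blacklist.isBlacklisted("Editorial Board"));      // true
        System.out.println(blacklist.isBlacklisted("A real article title")); // false
    }
}

Note that several patterns embed their own (?i) flag, so no extra case-insensitivity needs to be configured when compiling them.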
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -73,7 +73,7 @@ Workflow definition requirements

This property can be set using the maven `-D` switch.

`[oozie_app]` is the default directory name; however, it can be set to any value as long as the `oozieAppDir` property is provided with the directory name as its value.

Subworkflows are supported as well; subworkflow directories should be nested within the `[oozie_app]` directory.
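For illustration only (the exact property passed with `-D` depends on the paragraph preceding this hunk, and the value here is hypothetical): an invocation such as `mvn clean package -DoozieAppDir=my_oozie_app` would make the build expect the workflow definition under a directory named `my_oozie_app` rather than the default `oozie_app`.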
@ -17,6 +17,7 @@
<module>dhp-aggregation</module>
<module>dhp-distcp</module>
<module>dhp-graph-mapper</module>
+ <module>dhp-dedup</module>
</modules>

<pluginRepositories>
@ -310,6 +311,7 @@
</executions>
</plugin>


<plugin>
<!-- this plugin prepares oozie installer package-->

@ -523,6 +525,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
+ <version>${maven.failsave.plugin.version}</version>
<executions>
<execution>
<id>default-integration-test</id>
pom.xml
@ -114,6 +114,12 @@
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-graphx_2.11</artifactId>
+ <version>${dhp.spark.version}</version>
+ <scope>provided</scope>
+ </dependency>

<dependency>
<groupId>org.apache.commons</groupId>
@ -177,6 +183,7 @@
<version>${dhp.jackson.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
@ -190,6 +197,12 @@
<scope>provided</scope>
</dependency>

+ <dependency>
+ <groupId>eu.dnetlib</groupId>
+ <artifactId>dnet-pace-core</artifactId>
+ <version>4.0.0-SNAPSHOT</version>
+ </dependency>


<dependency>
<groupId>javax.persistence</groupId>
@ -203,6 +216,21 @@
<artifactId>amqp-client</artifactId>
<version>5.6.0</version>
</dependency>
+ <dependency>
+ <groupId>com.jayway.jsonpath</groupId>
+ <artifactId>json-path</artifactId>
+ <version>2.4.0</version>
+ </dependency>
+ <dependency>
+ <groupId>com.arakelian</groupId>
+ <artifactId>java-jq</artifactId>
+ <version>0.10.1</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.cmu</groupId>
+ <artifactId>secondstring</artifactId>
+ <version>1.0.0</version>
+ </dependency>

<dependency>
<groupId>org.apache.oozie</groupId>
@ -230,7 +258,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
- <version>3.6.0</version>
+ <version>${maven.compiler.plugin.version}</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
@ -259,27 +287,6 @@
</executions>
</plugin>

- <plugin>
- <groupId>eu.dnetlib</groupId>
- <artifactId>protoc-jar-maven-plugin</artifactId>
- <version>1.1.0</version>
- <executions>
- <execution>
- <phase>generate-sources</phase>
- <goals>
- <goal>run</goal>
- </goals>
- <configuration>
- <protocVersion>${google.protobuf.version}</protocVersion>
- <inputDirectories>
- <include>src/main/resources</include>
- </inputDirectories>
- <outputDirectory>src/gen/java</outputDirectory>
- </configuration>
- </execution>
- </executions>
- </plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
@ -342,6 +349,31 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <groupId>net.alchim31.maven</groupId>
+ <artifactId>scala-maven-plugin</artifactId>
+ <version>4.0.1</version>
+ <executions>
+ <execution>
+ <id>scala-compile-first</id>
+ <phase>initialize</phase>
+ <goals>
+ <goal>add-source</goal>
+ <goal>compile</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>scala-test-compile</id>
+ <phase>process-test-resources</phase>
+ <goals>
+ <goal>testCompile</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <scalaVersion>${scala.version}</scalaVersion>
+ </configuration>
+ </plugin>
</plugins>

<extensions>
@ -380,14 +412,15 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+ <maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
+ <maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
- <scala.version>2.11.8</scala.version>
+ <scala.version>2.11.12</scala.version>
</properties>
</project>