forked from D-Net/dnet-hadoop
aligned with origin/master, aligned model and mapping
This commit is contained in:
commit
176c5606bd
|
@ -1,6 +1,8 @@
|
|||
.DS_Store
|
||||
.idea
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
*~
|
||||
.classpath
|
||||
/*/.classpath
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -1,118 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Entity implements Serializable {
|
||||
|
||||
private String identifier;
|
||||
|
||||
private List<Pid> pid;
|
||||
|
||||
private List<String> title;
|
||||
|
||||
private List<String> date;
|
||||
|
||||
private String typology;
|
||||
|
||||
private List<String> authors;
|
||||
|
||||
private List<Subject> subject;
|
||||
|
||||
private String description;
|
||||
|
||||
private String completionStatus;
|
||||
|
||||
private List<Provenance> collectedFrom;
|
||||
|
||||
private List<String> publisher;
|
||||
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
public List<Pid> getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(List<Pid> pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public List<String> getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(List<String> title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public List<String> getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(List<String> date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public String getTypology() {
|
||||
return typology;
|
||||
}
|
||||
|
||||
public void setTypology(String typology) {
|
||||
this.typology = typology;
|
||||
}
|
||||
|
||||
public List<String> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<String> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
public List<Subject> getSubject() {
|
||||
return subject;
|
||||
}
|
||||
|
||||
public void setSubject(List<Subject> subject) {
|
||||
this.subject = subject;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public List<Provenance> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
|
||||
public void setCollectedFrom(List<Provenance> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
|
||||
public List<String> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(List<String> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
}
|
|
@ -1,33 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class Pid {
|
||||
|
||||
private String pid;
|
||||
|
||||
private String pidType;
|
||||
|
||||
public String getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(String pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public String getPidType() {
|
||||
return pidType;
|
||||
}
|
||||
|
||||
public void setPidType(String pidType) {
|
||||
this.pidType = pidType;
|
||||
}
|
||||
|
||||
public String generateId() {
|
||||
if(StringUtils.isEmpty(pid) || StringUtils.isEmpty(pidType))
|
||||
return null;
|
||||
return DHPUtils.md5(String.format("%s::%s", pid, pidType));
|
||||
}
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
public class Provenance {
|
||||
|
||||
private String datasourceId;
|
||||
|
||||
private String datasourceName;
|
||||
|
||||
private String completionStatus;
|
||||
|
||||
|
||||
public String getDatasourceId() {
|
||||
return datasourceId;
|
||||
}
|
||||
|
||||
public void setDatasourceId(String datasourceId) {
|
||||
this.datasourceId = datasourceId;
|
||||
}
|
||||
|
||||
public String getDatasourceName() {
|
||||
return datasourceName;
|
||||
}
|
||||
|
||||
public void setDatasourceName(String datasourceName) {
|
||||
this.datasourceName = datasourceName;
|
||||
}
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
}
|
|
@ -1,52 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Relation implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -9103706796710618813L;
|
||||
|
||||
private String source;
|
||||
|
||||
private String target;
|
||||
|
||||
private List<Provenance> provenance;
|
||||
|
||||
private RelationSemantic semantic;
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(final String target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public List<Provenance> getProvenance() {
|
||||
return provenance;
|
||||
}
|
||||
|
||||
public void setProvenance(final List<Provenance> provenance) {
|
||||
this.provenance = provenance;
|
||||
}
|
||||
|
||||
public RelationSemantic getSemantic() {
|
||||
return semantic;
|
||||
}
|
||||
|
||||
public void setSemantic(final RelationSemantic semantic) {
|
||||
this.semantic = semantic;
|
||||
}
|
||||
}
|
|
@ -1,16 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class RelationSemantic extends Subject implements Serializable {
|
||||
|
||||
public String inverse;
|
||||
|
||||
public String getInverse() {
|
||||
return inverse;
|
||||
}
|
||||
|
||||
public void setInverse(String inverse) {
|
||||
this.inverse = inverse;
|
||||
}
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Subject implements Serializable {
|
||||
|
||||
private String schema;
|
||||
|
||||
private String value;
|
||||
|
||||
public Subject() {
|
||||
|
||||
}
|
||||
|
||||
public Subject(String schema, String value) {
|
||||
this.schema = schema;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public String getSchema() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
public void setSchema(String schema) {
|
||||
this.schema = schema;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
public class Country extends Qualifier {
|
||||
|
||||
private DataInfo dataInfo;
|
||||
|
||||
public DataInfo getDataInfo() {
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public void setDataInfo(DataInfo dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Instance implements Serializable {
|
||||
|
||||
|
@ -12,7 +13,7 @@ public class Instance implements Serializable {
|
|||
|
||||
private KeyValue hostedby;
|
||||
|
||||
private String url;
|
||||
private List<String> url;
|
||||
|
||||
// other research products specifc
|
||||
private String distributionlocation;
|
||||
|
@ -21,6 +22,14 @@ public class Instance implements Serializable {
|
|||
|
||||
private Field<String> dateofacceptance;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargeamount;
|
||||
|
||||
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private Field<String> refereed; //peer-review status
|
||||
|
||||
public Field<String> getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
@ -53,11 +62,11 @@ public class Instance implements Serializable {
|
|||
this.hostedby = hostedby;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
public List<String> getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
public void setUrl(List<String> url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
|
@ -85,7 +94,29 @@ public class Instance implements Serializable {
|
|||
this.dateofacceptance = dateofacceptance;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
||||
public void setProcessingchargeamount(Field<String> processingchargeamount) {
|
||||
this.processingchargeamount = processingchargeamount;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargecurrency() {
|
||||
return processingchargecurrency;
|
||||
}
|
||||
|
||||
public void setProcessingchargecurrency(Field<String> processingchargecurrency) {
|
||||
this.processingchargecurrency = processingchargecurrency;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
return refereed;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
public String toComparableString(){
|
||||
return String.format("%s::%s::%s::%s",
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class Result extends OafEntity implements Serializable {
|
||||
|
||||
|
@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
// common fields
|
||||
private Qualifier language;
|
||||
|
||||
private List<Qualifier> country;
|
||||
private List<Country> country;
|
||||
|
||||
private List<StructuredProperty> subject;
|
||||
|
||||
|
@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
|
||||
private List<Field<String>> coverage;
|
||||
|
||||
private Field<String> refereed; //peer-review status
|
||||
private Qualifier bestaccessright;
|
||||
|
||||
private List<Context> context;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargeamount;
|
||||
|
||||
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private List<ExternalReference> externalReference;
|
||||
|
||||
private List<Instance> instance;
|
||||
|
@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.language = language;
|
||||
}
|
||||
|
||||
public List<Qualifier> getCountry() {
|
||||
public List<Country> getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public void setCountry(List<Qualifier> country) {
|
||||
public void setCountry(List<Country> country) {
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
|
@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.coverage = coverage;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
return refereed;
|
||||
public Qualifier getBestaccessright() {
|
||||
return bestaccessright;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
this.refereed = refereed;
|
||||
public void setBestaccessright(Qualifier bestaccessright) {
|
||||
this.bestaccessright = bestaccessright;
|
||||
}
|
||||
|
||||
public List<Context> getContext() {
|
||||
|
@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.instance = instance;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
||||
public Result setProcessingchargeamount(Field<String> processingchargeamount) {
|
||||
this.processingchargeamount = processingchargeamount;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargecurrency() {
|
||||
return processingchargecurrency;
|
||||
}
|
||||
|
||||
public Result setProcessingchargecurrency(Field<String> processingchargecurrency) {
|
||||
this.processingchargecurrency = processingchargecurrency;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
|
@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
|
||||
coverage = mergeLists(coverage, r.getCoverage());
|
||||
|
||||
if (r.getRefereed() != null && compareTrust(this, r) < 0)
|
||||
refereed = r.getRefereed();
|
||||
|
||||
context = mergeLists(context, r.getContext());
|
||||
|
||||
if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
|
||||
processingchargeamount = r.getProcessingchargeamount();
|
||||
|
||||
if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
|
||||
processingchargecurrency = r.getProcessingchargecurrency();
|
||||
|
||||
externalReference = mergeLists(externalReference, r.getExternalReference());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
return a.size() > b.size() ? a : b;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
|
|
@ -262,12 +262,9 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
r.setContributor(prepareContributors(doc, info));
|
||||
r.setResourcetype(prepareResourceType(doc, info));
|
||||
r.setCoverage(prepareCoverages(doc, info));
|
||||
r.setRefereed(null); // NOT PRESENT IN MDSTORES
|
||||
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
|
||||
r.setProcessingchargeamount(null); // NOT PRESENT IN MDSTORES
|
||||
r.setProcessingchargecurrency(null); // NOT PRESENT IN MDSTORES
|
||||
}
|
||||
|
||||
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||
|
|
|
@ -104,7 +104,7 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
|||
final String url = ((Node) o).getText().trim();
|
||||
if (url.startsWith("http")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(url);
|
||||
instance.setUrl(Arrays.asList(url));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
|
@ -112,6 +112,9 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
|||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(((Node) o).getText().trim());
|
||||
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
|
@ -82,6 +82,9 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
return res;
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
package eu.dnetlib.dedup;
|
||||
|
||||
import eu.dnetlib.dedup.graph.ConnectedComponent;
|
||||
import eu.dnetlib.dedup.graph.GraphProcessor;
|
||||
import com.google.common.hash.Hashing;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
@ -12,8 +11,6 @@ import org.apache.spark.api.java.JavaPairRDD;
|
|||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.graphx.Edge;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
@ -44,30 +41,23 @@ public class SparkCreateSimRels {
|
|||
final String inputPath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
|
||||
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
|
||||
|
||||
final long total = sc.textFile(inputPath + "/" + entity).count();
|
||||
|
||||
JavaPairRDD<Object, MapDocument> vertexes = sc.textFile(inputPath + "/" + entity)
|
||||
.map(s->{
|
||||
JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(inputPath + "/" + entity)
|
||||
.mapToPair(s->{
|
||||
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s);
|
||||
return new Tuple2<>(d.getIdentifier(), d);})
|
||||
.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2()));
|
||||
|
||||
|
||||
|
||||
|
||||
JavaPairRDD<String, MapDocument> mapDocument = vertexes.mapToPair((PairFunction<Tuple2<Object, MapDocument>, String, MapDocument>) item -> new Tuple2<String, MapDocument>(item._2().getIdentifier(), item._2()));
|
||||
return new Tuple2<>(d.getIdentifier(), d);});
|
||||
|
||||
//create blocks for deduplication
|
||||
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc,mapDocument, dedupConf);
|
||||
|
||||
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
|
||||
// JavaPairRDD<String, Iterable<MapDocument>> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf);
|
||||
|
||||
//create relations by comparing only elements in the same group
|
||||
final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
|
||||
|
||||
// final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations(sc, blocks, dedupConf);
|
||||
|
||||
final JavaRDD<Relation> isSimilarToRDD = dedupRels.map(simRel -> {
|
||||
final Relation r = new Relation();
|
||||
|
@ -79,17 +69,5 @@ public class SparkCreateSimRels {
|
|||
|
||||
spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -18,11 +18,11 @@ import java.util.List;
|
|||
public class SparkCreateDedupTest {
|
||||
|
||||
String configuration;
|
||||
String entity = "publication";
|
||||
String entity = "organization";
|
||||
|
||||
@Before
|
||||
public void setUp() throws IOException {
|
||||
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json"));
|
||||
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
|
||||
|
||||
}
|
||||
|
||||
|
@ -31,7 +31,7 @@ public class SparkCreateDedupTest {
|
|||
public void createSimRelsTest() throws Exception {
|
||||
SparkCreateSimRels.main(new String[] {
|
||||
"-mt", "local[*]",
|
||||
"-s", "/home/sandro/betadump",
|
||||
"-s", "/Users/miconis/dumps",
|
||||
"-e", entity,
|
||||
"-c", ArgumentApplicationParser.compressArgument(configuration),
|
||||
"-t", "/tmp/dedup",
|
||||
|
@ -44,7 +44,7 @@ public class SparkCreateDedupTest {
|
|||
|
||||
SparkCreateConnectedComponent.main(new String[] {
|
||||
"-mt", "local[*]",
|
||||
"-s", "/home/sandro/betadump",
|
||||
"-s", "/Users/miconis/dumps",
|
||||
"-e", entity,
|
||||
"-c", ArgumentApplicationParser.compressArgument(configuration),
|
||||
"-t", "/tmp/dedup",
|
||||
|
@ -56,7 +56,7 @@ public class SparkCreateDedupTest {
|
|||
public void dedupRecordTest() throws Exception {
|
||||
SparkCreateDedupRecord.main(new String[] {
|
||||
"-mt", "local[*]",
|
||||
"-s", "/home/sandro/betadump",
|
||||
"-s", "/Users/miconis/dumps",
|
||||
"-e", entity,
|
||||
"-c", ArgumentApplicationParser.compressArgument(configuration),
|
||||
"-d", "/tmp/dedup",
|
||||
|
@ -64,26 +64,10 @@ public class SparkCreateDedupTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void printCC() throws Exception {
|
||||
public void printConfiguration() throws Exception {
|
||||
System.out.println(ArgumentApplicationParser.compressArgument(configuration));
|
||||
}
|
||||
|
||||
|
||||
// [20|grid________::6031f94bef015a37783268ec1e75f17f, 20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46]
|
||||
// [20|grid________::672e1e5cef49e68f124d3da5225a7357, 20|grid________::7a402604c3853c7a0af14f88f56bf7e1]
|
||||
// [20|grid________::2fc05b35e11d915b220a66356053eae2, 20|grid________::b02fb3176eb38f6c572722550c07e7ab]
|
||||
// [20|grid________::bc86248ab2b8d7955dcaf592ba342262, 20|corda_______::45a8ec964029278fb938805182e247a8]
|
||||
// [20|doajarticles::74551f800ad1c81a6cd31c5162887b7f, 20|rcuk________::86dc9a83df05a58917f38ca09f814617]
|
||||
// [20|nsf_________::5e837d8e6444cc298db314ea54ad2f4a, 20|snsf________::7b54715f0ec5c6a0a44672f45d98be8d]
|
||||
// [20|corda__h2020::7ee7e57bad06b92c1a568dd61e10ba8c, 20|snsf________::2d4a2695221a3ce0c749ee34e064c0b3]
|
||||
// [20|corda_______::25220a523550176dac9e5432dac43596, 20|grid________::9782f16a46650cbbfaaa2315109507d1]
|
||||
// [20|nih_________::88c3b664dcc7af9e827f94ac964cd66c, 20|grid________::238d3ac0a7d119d5c8342a647f5245f5]
|
||||
// [20|rcuk________::0582c20fcfb270f9ec1b19b0f0dcd881, 20|nsf_________::9afa48ddf0bc2cd4f3c41dc41daabcdb]
|
||||
// [20|rcuk________::fbc445f8d24e569bc8b640dba86ae978, 20|corda_______::5a8a4094f1b68a88fc56e65cea7ebfa0]
|
||||
// [20|rcuk________::7485257cd5caaf6316ba8062feea801d, 20|grid________::dded811e5f5a4c9f7ca8f9955e52ade7]
|
||||
// [20|nih_________::0576dd270d29d5b7c23dd15a827ccdb9, 20|corda_______::10ca69f6a4a121f75fdde1feee226ce0]
|
||||
// [20|corda__h2020::0429f6addf10e9b2939d65c6fb097ffd, 20|grid________::6563ec73057624d5ccc0cd050b302181]
|
||||
|
||||
@Test
|
||||
public void testHashCode() {
|
||||
final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f";
|
||||
|
|
|
@ -7,9 +7,9 @@
|
|||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"idPath":"$.id",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true",
|
||||
"idPath": "$.id",
|
||||
"maxIterations": "20"
|
||||
},
|
||||
"pace" : {
|
||||
|
@ -31,7 +31,7 @@
|
|||
}
|
||||
],
|
||||
"threshold": 1,
|
||||
"aggregation": "SC",
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer2",
|
||||
|
@ -52,10 +52,24 @@
|
|||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "numbersMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "romansMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1,
|
||||
"aggregation": "NC",
|
||||
"aggregation": "AND",
|
||||
"positive": "layer3",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
|
@ -69,12 +83,11 @@
|
|||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4",
|
||||
"threshold": "0.0"
|
||||
"windowSize": "4"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"threshold": 0.7,
|
||||
"aggregation": "W_MEAN",
|
||||
"positive": "layer4",
|
||||
"negative": "NO_MATCH",
|
||||
|
@ -87,19 +100,18 @@
|
|||
"field": "legalname",
|
||||
"comparator": "keywordMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4",
|
||||
"threshold": "0.7"
|
||||
"windowSize": "4"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "W_MEAN",
|
||||
"threshold": 0.9,
|
||||
"aggregation": "AVG",
|
||||
"positive": "layer5",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer5",
|
||||
"ignoreUndefined": "false"
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer5": {
|
||||
"fields": [
|
||||
|
@ -133,19 +145,20 @@
|
|||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
|
||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
|
||||
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||
"key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
||||
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
||||
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
|
||||
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||
|
@ -233,7 +246,7 @@
|
|||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
||||
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
||||
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
||||
|
@ -243,7 +256,11 @@
|
|||
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
||||
"key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"]
|
||||
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
|
||||
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
|
||||
"key::107" : ["agricultural forestry", "af", "a f"],
|
||||
"key::108" : ["agricultural mechanical", "am", "a m"],
|
||||
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -66,14 +66,13 @@
|
|||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"threshold": "0.5",
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"threshold": 0.5,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
|
@ -97,7 +96,7 @@
|
|||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "NC",
|
||||
"aggregation": "AND",
|
||||
"positive": "layer3",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
|
@ -114,7 +113,7 @@
|
|||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "SUM",
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
Loading…
Reference in New Issue