merge branch with fork master
This commit is contained in:
commit
92e3a52e91
|
@ -6,7 +6,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-build</artifactId>
|
<artifactId>dhp-build</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-build</artifactId>
|
<artifactId>dhp-build</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-code-style</artifactId>
|
<artifactId>dhp-code-style</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
|
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-build</artifactId>
|
<artifactId>dhp-build</artifactId>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
<relativePath>../</relativePath>
|
<relativePath>../</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
<relativePath>../</relativePath>
|
<relativePath>../</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
|
||||||
|
public class LicenseComparator implements Comparator<Qualifier> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(Qualifier left, Qualifier right) {
|
||||||
|
|
||||||
|
if (left == null && right == null)
|
||||||
|
return 0;
|
||||||
|
if (left == null)
|
||||||
|
return 1;
|
||||||
|
if (right == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
String lClass = left.getClassid();
|
||||||
|
String rClass = right.getClassid();
|
||||||
|
|
||||||
|
if (lClass.equals(rClass))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (lClass.equals("OPEN SOURCE"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("OPEN SOURCE"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("OPEN"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("OPEN"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("6MONTHS"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("6MONTHS"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("12MONTHS"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("12MONTHS"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("EMBARGO"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("EMBARGO"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("RESTRICTED"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("RESTRICTED"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("CLOSED"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("CLOSED"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("UNKNOWN"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("UNKNOWN"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
// Else (but unlikely), lexicographical ordering will do.
|
||||||
|
return lClass.compareTo(rClass);
|
||||||
|
}
|
||||||
|
}
|
|
@ -8,7 +8,7 @@ public class DataInfo implements Serializable {
|
||||||
|
|
||||||
private Boolean invisible = false;
|
private Boolean invisible = false;
|
||||||
private Boolean inferred;
|
private Boolean inferred;
|
||||||
private Boolean deletedbyinference;
|
private Boolean deletedbyinference = false;
|
||||||
private String trust;
|
private String trust;
|
||||||
private String inferenceprovenance;
|
private String inferenceprovenance;
|
||||||
private Qualifier provenanceaction;
|
private Qualifier provenanceaction;
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public class Field<T> implements Serializable {
|
public class Field<T> implements Serializable {
|
||||||
|
|
||||||
|
@ -39,6 +40,6 @@ public class Field<T> implements Serializable {
|
||||||
if (getClass() != obj.getClass())
|
if (getClass() != obj.getClass())
|
||||||
return false;
|
return false;
|
||||||
Field<T> other = (Field<T>) obj;
|
Field<T> other = (Field<T>) obj;
|
||||||
return getValue().equals(other.getValue());
|
return Objects.equals(getValue(), other.getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,6 +106,7 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
||||||
.stream(lists)
|
.stream(lists)
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.flatMap(List::stream)
|
.flatMap(List::stream)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.distinct()
|
.distinct()
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
|
@ -256,7 +256,25 @@ public class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
subject = mergeLists(subject, r.getSubject());
|
subject = mergeLists(subject, r.getSubject());
|
||||||
|
|
||||||
|
// merge title lists: main title with higher trust and distinct between the others
|
||||||
|
StructuredProperty baseMainTitle = null;
|
||||||
|
if (title != null) {
|
||||||
|
baseMainTitle = getMainTitle(title);
|
||||||
|
title.remove(baseMainTitle);
|
||||||
|
}
|
||||||
|
|
||||||
|
StructuredProperty newMainTitle = null;
|
||||||
|
if (r.getTitle() != null) {
|
||||||
|
newMainTitle = getMainTitle(r.getTitle());
|
||||||
|
r.getTitle().remove(newMainTitle);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (newMainTitle != null && compareTrust(this, r) < 0)
|
||||||
|
baseMainTitle = newMainTitle;
|
||||||
|
|
||||||
title = mergeLists(title, r.getTitle());
|
title = mergeLists(title, r.getTitle());
|
||||||
|
if (title != null && baseMainTitle != null)
|
||||||
|
title.add(baseMainTitle);
|
||||||
|
|
||||||
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
||||||
|
|
||||||
|
@ -306,4 +324,15 @@ public class Result extends OafEntity implements Serializable {
|
||||||
}
|
}
|
||||||
return a.size() > b.size() ? a : b;
|
return a.size() > b.size() ? a : b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private StructuredProperty getMainTitle(List<StructuredProperty> titles) {
|
||||||
|
// need to check if the list of titles contains more than 1 main title? (in that case, we should chose which
|
||||||
|
// main title select in the list)
|
||||||
|
for (StructuredProperty title : titles) {
|
||||||
|
if (title.getQualifier() != null && title.getQualifier().getClassid() != null)
|
||||||
|
if (title.getQualifier().getClassid().equals("main title"))
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-actionmanager</artifactId>
|
<artifactId>dhp-actionmanager</artifactId>
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-aggregation</artifactId>
|
<artifactId>dhp-aggregation</artifactId>
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -61,7 +61,6 @@ public class BlackListTest {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void noRemoveTest() throws Exception {
|
public void noRemoveTest() throws Exception {
|
||||||
SparkRemoveBlacklistedRelationJob
|
SparkRemoveBlacklistedRelationJob
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -57,7 +57,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||||
<version>[1.0.0,2.0.0)</version>
|
<version>[2.0.0,3.0.0)</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
|
@ -4,48 +4,45 @@ package eu.dnetlib.dhp.broker.model;
|
||||||
public enum Topic {
|
public enum Topic {
|
||||||
|
|
||||||
// ENRICHMENT MISSING
|
// ENRICHMENT MISSING
|
||||||
ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"),
|
ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT(
|
||||||
ENRICH_MISSING_ABSTRACT("ENRICH/MISSING/ABSTRACT"),
|
"ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE(
|
||||||
ENRICH_MISSING_PUBLICATION_DATE("ENRICH/MISSING/PUBLICATION_DATE"),
|
"ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID(
|
||||||
ENRICH_MISSING_PID("ENRICH/MISSING/PID"),
|
"ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE(
|
||||||
ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"),
|
"ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC(
|
||||||
ENRICH_MISSING_SOFTWARE("ENRICH/MISSING/SOFTWARE"),
|
"ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV(
|
||||||
ENRICH_MISSING_SUBJECT_MESHEUROPMC("ENRICH/MISSING/SUBJECT/MESHEUROPMC"),
|
"ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL(
|
||||||
ENRICH_MISSING_SUBJECT_ARXIV("ENRICH/MISSING/SUBJECT/ARXIV"),
|
"ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC(
|
||||||
ENRICH_MISSING_SUBJECT_JEL("ENRICH/MISSING/SUBJECT/JEL"),
|
"ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM(
|
||||||
ENRICH_MISSING_SUBJECT_DDC("ENRICH/MISSING/SUBJECT/DDC"),
|
"ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK(
|
||||||
ENRICH_MISSING_SUBJECT_ACM("ENRICH/MISSING/SUBJECT/ACM"),
|
"ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID(
|
||||||
ENRICH_MISSING_SUBJECT_RVK("ENRICH/MISSING/SUBJECT/RVK"),
|
"ENRICH/MISSING/AUTHOR/ORCID"),
|
||||||
ENRICH_MISSING_AUTHOR_ORCID("ENRICH/MISSING/AUTHOR/ORCID"),
|
|
||||||
|
|
||||||
// ENRICHMENT MORE
|
// ENRICHMENT MORE
|
||||||
ENRICH_MORE_PID("ENRICH/MORE/PID"),
|
ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT(
|
||||||
ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"),
|
"ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT(
|
||||||
ENRICH_MORE_ABSTRACT("ENRICH/MORE/ABSTRACT"),
|
"ENRICH/MORE/PROJECT"), ENRICH_MORE_SOFTWARE("ENRICH/MORE/SOFTWARE"), ENRICH_MORE_SUBJECT_MESHEUROPMC(
|
||||||
ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"),
|
"ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV(
|
||||||
ENRICH_MORE_PROJECT("ENRICH/MORE/PROJECT"),
|
"ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL(
|
||||||
ENRICH_MORE_SUBJECT_MESHEUROPMC("ENRICH/MORE/SUBJECT/MESHEUROPMC"),
|
"ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC(
|
||||||
ENRICH_MORE_SUBJECT_ARXIV("ENRICH/MORE/SUBJECT/ARXIV"),
|
"ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM(
|
||||||
ENRICH_MORE_SUBJECT_JEL("ENRICH/MORE/SUBJECT/JEL"),
|
"ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"),
|
||||||
ENRICH_MORE_SUBJECT_DDC("ENRICH/MORE/SUBJECT/DDC"),
|
|
||||||
ENRICH_MORE_SUBJECT_ACM("ENRICH/MORE/SUBJECT/ACM"),
|
|
||||||
ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"),
|
|
||||||
|
|
||||||
// ADDITION
|
// ADDITION
|
||||||
ADD_BY_PROJECT("ADD/BY_PROJECT"),
|
ADD_BY_PROJECT("ADD/BY_PROJECT"),
|
||||||
|
|
||||||
// OTHER RELS
|
// OTHER RELS
|
||||||
ENRICH_MISSING_PUBLICATION_IS_RELATED_TO("ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"),
|
ENRICH_MISSING_PUBLICATION_IS_RELATED_TO(
|
||||||
ENRICH_MISSING_PUBLICATION_REFERENCES("ENRICH/MISSING/PUBLICATION/REFERENCES"),
|
"ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"), ENRICH_MISSING_PUBLICATION_REFERENCES(
|
||||||
ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY("ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"),
|
"ENRICH/MISSING/PUBLICATION/REFERENCES"), ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY(
|
||||||
ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO("ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"),
|
"ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO(
|
||||||
ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY("ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"),
|
"ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY(
|
||||||
|
"ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"),
|
||||||
|
|
||||||
ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"),
|
ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"), ENRICH_MISSING_DATASET_REFERENCES(
|
||||||
ENRICH_MISSING_DATASET_REFERENCES("ENRICH/MISSING/DATASET/REFERENCES"),
|
"ENRICH/MISSING/DATASET/REFERENCES"), ENRICH_MISSING_DATASET_IS_REFERENCED_BY(
|
||||||
ENRICH_MISSING_DATASET_IS_REFERENCED_BY("ENRICH/MISSING/DATASET/IS_REFERENCED_BY"),
|
"ENRICH/MISSING/DATASET/IS_REFERENCED_BY"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO(
|
||||||
ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO("ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"),
|
"ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY(
|
||||||
ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY("ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY");
|
"ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY"),;
|
||||||
|
|
||||||
Topic(final String path) {
|
Topic(final String path) {
|
||||||
this.path = path;
|
this.path = path;
|
||||||
|
@ -59,7 +56,9 @@ public enum Topic {
|
||||||
|
|
||||||
public static Topic fromPath(final String path) {
|
public static Topic fromPath(final String path) {
|
||||||
for (final Topic t : Topic.values()) {
|
for (final Topic t : Topic.values()) {
|
||||||
if (t.getPath().equals(path)) { return t; }
|
if (t.getPath().equals(path)) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,14 @@ package eu.dnetlib.dhp.broker.oa;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
@ -27,20 +30,35 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.broker.model.Event;
|
import eu.dnetlib.dhp.broker.model.Event;
|
||||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSoftware;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSoftware;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
@ -50,23 +68,43 @@ public class GenerateEventsApplication {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
||||||
|
|
||||||
private static final UpdateMatcher<?> enrichMissingAbstract = new EnrichMissingAbstract();
|
// Simple Matchers
|
||||||
private static final UpdateMatcher<?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
|
private static final UpdateMatcher<Result, ?> enrichMissingAbstract = new EnrichMissingAbstract();
|
||||||
private static final UpdateMatcher<?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
|
private static final UpdateMatcher<Result, ?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
|
||||||
private static final UpdateMatcher<?> enrichMissingPid = new EnrichMissingPid();
|
private static final UpdateMatcher<Result, ?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
|
||||||
private static final UpdateMatcher<?> enrichMissingProject = new EnrichMissingProject();
|
private static final UpdateMatcher<Result, ?> enrichMissingPid = new EnrichMissingPid();
|
||||||
private static final UpdateMatcher<?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
|
private static final UpdateMatcher<Result, ?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
|
||||||
private static final UpdateMatcher<?> enrichMissingSubject = new EnrichMissingSubject();
|
private static final UpdateMatcher<Result, ?> enrichMissingSubject = new EnrichMissingSubject();
|
||||||
private static final UpdateMatcher<?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
|
private static final UpdateMatcher<Result, ?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
|
||||||
private static final UpdateMatcher<?> enrichMorePid = new EnrichMorePid();
|
private static final UpdateMatcher<Result, ?> enrichMorePid = new EnrichMorePid();
|
||||||
private static final UpdateMatcher<?> enrichMoreSubject = new EnrichMoreSubject();
|
private static final UpdateMatcher<Result, ?> enrichMoreSubject = new EnrichMoreSubject();
|
||||||
|
|
||||||
|
// Advanced matchers
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMissingProject = new EnrichMissingProject();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMoreProject = new EnrichMoreProject();
|
||||||
|
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMissingSoftware = new EnrichMissingSoftware();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
|
||||||
|
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
|
||||||
|
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(GenerateEventsApplication.class
|
.toString(
|
||||||
|
GenerateEventsApplication.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
@ -82,9 +120,6 @@ public class GenerateEventsApplication {
|
||||||
final String eventsPath = parser.get("eventsPath");
|
final String eventsPath = parser.get("eventsPath");
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String resultClassName = parser.get("resultTableName");
|
|
||||||
log.info("resultTableName: {}", resultClassName);
|
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
|
@ -111,17 +146,17 @@ public class GenerateEventsApplication {
|
||||||
final String graphPath,
|
final String graphPath,
|
||||||
final Class<R> resultClazz) {
|
final Class<R> resultClazz) {
|
||||||
|
|
||||||
final Dataset<R> results =
|
final Dataset<R> results = readPath(
|
||||||
readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
|
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
|
||||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||||
|
|
||||||
final Dataset<Relation> rels =
|
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||||
readPath(spark, graphPath + "/relation", Relation.class)
|
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||||
.filter(r -> r.getRelClass().equals("TODO")); // TODO mergedIN
|
|
||||||
|
|
||||||
final Column c = null; // TODO
|
final Column c = null; // TODO
|
||||||
|
|
||||||
final Dataset<Row> aa = results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
|
final Dataset<Row> aa = results
|
||||||
|
.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
|
||||||
.groupBy(rels.col("target"))
|
.groupBy(rels.col("target"))
|
||||||
.agg(c)
|
.agg(c)
|
||||||
.filter(x -> x.size() > 1)
|
.filter(x -> x.size() > 1)
|
||||||
|
@ -134,7 +169,7 @@ public class GenerateEventsApplication {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Event> generateSimpleEvents(final Result... children) {
|
private List<Event> generateSimpleEvents(final Collection<Result> children) {
|
||||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
for (final Result target : children) {
|
for (final Result target : children) {
|
||||||
|
@ -142,7 +177,6 @@ public class GenerateEventsApplication {
|
||||||
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
|
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
|
||||||
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
|
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
|
||||||
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
|
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
|
||||||
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children));
|
|
||||||
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
|
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
|
||||||
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
|
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
|
||||||
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
|
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
|
||||||
|
@ -153,6 +187,87 @@ public class GenerateEventsApplication {
|
||||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects) {
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Pair<Result, List<Project>> target : childrenWithProjects) {
|
||||||
|
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||||
|
list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||||
|
}
|
||||||
|
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares) {
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
|
||||||
|
list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||||
|
list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||||
|
}
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Event> generatePublicationRelatedEvents(final String relType,
|
||||||
|
final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels) {
|
||||||
|
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
final List<Pair<Result, List<Publication>>> cleanedChildrens = childrenWithRels
|
||||||
|
.stream()
|
||||||
|
.filter(p -> p.getRight().containsKey(relType))
|
||||||
|
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||||
|
.filter(p -> p.getRight().size() > 0)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
|
||||||
|
if (relType.equals("isRelatedTo")) {
|
||||||
|
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("references")) {
|
||||||
|
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isReferencedBy")) {
|
||||||
|
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedTo")) {
|
||||||
|
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedBy")) {
|
||||||
|
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Event> generateDatasetRelatedEvents(final String relType,
|
||||||
|
final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels) {
|
||||||
|
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
final List<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>> cleanedChildrens = childrenWithRels
|
||||||
|
.stream()
|
||||||
|
.filter(p -> p.getRight().containsKey(relType))
|
||||||
|
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||||
|
.filter(p -> p.getRight().size() > 0)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
|
||||||
|
if (relType.equals("isRelatedTo")) {
|
||||||
|
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("references")) {
|
||||||
|
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isReferencedBy")) {
|
||||||
|
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedTo")) {
|
||||||
|
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedBy")) {
|
||||||
|
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
final SparkSession spark,
|
final SparkSession spark,
|
||||||
final String inputPath,
|
final String inputPath,
|
||||||
|
|
|
@ -1,35 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Project;
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingProject extends UpdateMatcher<Project> {
|
|
||||||
|
|
||||||
public EnrichMissingProject() {
|
|
||||||
super(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected List<UpdateInfo<Project>> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public UpdateInfo<Project> generateUpdateInfo(final Project highlightValue,
|
|
||||||
final Result source,
|
|
||||||
final Result target) {
|
|
||||||
return new UpdateInfo<>(
|
|
||||||
Topic.ENRICH_MISSING_PROJECT,
|
|
||||||
highlightValue, source, target,
|
|
||||||
(p, prj) -> p.getProjects().add(prj),
|
|
||||||
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -12,9 +12,8 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public abstract class UpdateMatcher<T> {
|
public abstract class UpdateMatcher<K, T> {
|
||||||
|
|
||||||
private final boolean multipleUpdate;
|
private final boolean multipleUpdate;
|
||||||
|
|
||||||
|
@ -22,15 +21,16 @@ public abstract class UpdateMatcher<T> {
|
||||||
this.multipleUpdate = multipleUpdate;
|
this.multipleUpdate = multipleUpdate;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final Result res, final Result... others) {
|
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others) {
|
||||||
|
|
||||||
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||||
|
|
||||||
for (final Result source : others) {
|
for (final K source : others) {
|
||||||
if (source != res) {
|
if (source != res) {
|
||||||
for (final UpdateInfo<T> info : findUpdates(source, res)) {
|
for (final UpdateInfo<T> info : findUpdates(source, res)) {
|
||||||
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||||
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else {
|
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||||
|
} else {
|
||||||
infoMap.put(s, info);
|
infoMap.put(s, info);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,11 +51,11 @@ public abstract class UpdateMatcher<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract List<UpdateInfo<T>> findUpdates(Result source, Result target);
|
protected abstract List<UpdateInfo<T>> findUpdates(K source, K target);
|
||||||
|
|
||||||
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
|
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
|
||||||
final Result source,
|
final K source,
|
||||||
final Result target);
|
final K target);
|
||||||
|
|
||||||
protected static boolean isMissing(final List<Field<String>> list) {
|
protected static boolean isMissing(final List<Field<String>> list) {
|
||||||
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
|
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public abstract class AbstractEnrichMissingDataset
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
|
||||||
|
|
||||||
|
private final Topic topic;
|
||||||
|
|
||||||
|
public AbstractEnrichMissingDataset(final Topic topic) {
|
||||||
|
super(true);
|
||||||
|
this.topic = topic;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
|
||||||
|
final Pair<Result, List<Dataset>> source,
|
||||||
|
final Pair<Result, List<Dataset>> target) {
|
||||||
|
|
||||||
|
final Set<String> existingDatasets = target
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.map(Dataset::getId)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.filter(d -> !existingDatasets.contains(d.getId()))
|
||||||
|
.map(ConversionUtils::oafDatasetToBrokerDataset)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Dataset highlightValue,
|
||||||
|
final Pair<Result, List<Dataset>> source,
|
||||||
|
final Pair<Result, List<Dataset>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
getTopic(),
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, rel) -> p.getDatasets().add(rel),
|
||||||
|
rel -> rel.getInstances().get(0).getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Topic getTopic() {
|
||||||
|
return topic;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsReferencedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsRelatedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsSupplementedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsSupplementedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetReferences() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_REFERENCES);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingProject
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||||
|
|
||||||
|
public EnrichMissingProject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||||
|
final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PROJECT,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
|
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||||
|
|
||||||
|
public EnrichMoreProject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||||
|
final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_PROJECT,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
|
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public abstract class AbstractEnrichMissingPublication
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
|
||||||
|
|
||||||
|
private final Topic topic;
|
||||||
|
|
||||||
|
public AbstractEnrichMissingPublication(final Topic topic) {
|
||||||
|
super(true);
|
||||||
|
this.topic = topic;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
|
||||||
|
final Pair<Result, List<Publication>> source,
|
||||||
|
final Pair<Result, List<Publication>> target) {
|
||||||
|
|
||||||
|
final Set<String> existingPublications = target
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.map(Publication::getId)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.filter(d -> !existingPublications.contains(d.getId()))
|
||||||
|
.map(ConversionUtils::oafPublicationToBrokerPublication)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Publication highlightValue,
|
||||||
|
final Pair<Result, List<Publication>> source,
|
||||||
|
final Pair<Result, List<Publication>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
getTopic(),
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, rel) -> p.getPublications().add(rel),
|
||||||
|
rel -> rel.getInstances().get(0).getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Topic getTopic() {
|
||||||
|
return topic;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsReferencedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsRelatedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsSupplementedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsSupplementedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationReferences() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_REFERENCES);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,15 +1,16 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
|
||||||
|
|
||||||
public EnrichMissingAbstract() {
|
public EnrichMissingAbstract() {
|
||||||
super(false);
|
super(false);
|
||||||
|
@ -24,7 +25,8 @@ public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public UpdateInfo<String> generateUpdateInfo(final String highlightValue, final Result source,
|
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||||
|
final Result source,
|
||||||
final Result target) {
|
final Result target) {
|
||||||
return new UpdateInfo<>(
|
return new UpdateInfo<>(
|
||||||
Topic.ENRICH_MISSING_ABSTRACT,
|
Topic.ENRICH_MISSING_ABSTRACT,
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -7,10 +7,11 @@ import java.util.List;
|
||||||
import org.apache.commons.lang3.tuple.Pair;
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Pair<String, String>> {
|
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> {
|
||||||
|
|
||||||
public EnrichMissingAuthorOrcid() {
|
public EnrichMissingAuthorOrcid() {
|
||||||
super(true);
|
super(true);
|
||||||
|
@ -24,7 +25,8 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<Pair<String, String>
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
final Result source, final Result target) {
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
return new UpdateInfo<>(
|
return new UpdateInfo<>(
|
||||||
Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||||
highlightValue, source, target,
|
highlightValue, source, target,
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -7,12 +7,13 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
|
public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||||
|
|
||||||
public EnrichMissingOpenAccess() {
|
public EnrichMissingOpenAccess() {
|
||||||
super(true);
|
super(true);
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -7,11 +7,12 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMissingPid extends UpdateMatcher<Pid> {
|
public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
|
||||||
|
|
||||||
public EnrichMissingPid() {
|
public EnrichMissingPid() {
|
||||||
super(true);
|
super(true);
|
|
@ -1,15 +1,16 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> {
|
||||||
|
|
||||||
public EnrichMissingPublicationDate() {
|
public EnrichMissingPublicationDate() {
|
||||||
super(false);
|
super(false);
|
|
@ -0,0 +1,42 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
|
||||||
|
public class EnrichMissingSoftware
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||||
|
|
||||||
|
public EnrichMissingSoftware() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_SOFTWARE,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, s) -> p.getSoftwares().add(s),
|
||||||
|
s -> s.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -8,13 +8,14 @@ import java.util.stream.Collectors;
|
||||||
import org.apache.commons.lang3.tuple.Pair;
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> {
|
public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||||
|
|
||||||
public EnrichMissingSubject() {
|
public EnrichMissingSubject() {
|
||||||
super(true);
|
super(true);
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -7,12 +7,13 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
|
public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||||
|
|
||||||
public EnrichMoreOpenAccess() {
|
public EnrichMoreOpenAccess() {
|
||||||
super(true);
|
super(true);
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -7,11 +7,12 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMorePid extends UpdateMatcher<Pid> {
|
public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
|
||||||
|
|
||||||
public EnrichMorePid() {
|
public EnrichMorePid() {
|
||||||
super(true);
|
super(true);
|
|
@ -0,0 +1,42 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
|
||||||
|
public class EnrichMoreSoftware
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||||
|
|
||||||
|
public EnrichMoreSoftware() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_SOFTWARE,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, s) -> p.getSoftwares().add(s),
|
||||||
|
s -> s.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -8,11 +8,12 @@ import java.util.stream.Collectors;
|
||||||
import org.apache.commons.lang3.tuple.Pair;
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> {
|
public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||||
|
|
||||||
public EnrichMoreSubject() {
|
public EnrichMoreSubject() {
|
||||||
super(true);
|
super(true);
|
|
@ -4,4 +4,6 @@ package eu.dnetlib.dhp.broker.oa.util;
|
||||||
public class BrokerConstants {
|
public class BrokerConstants {
|
||||||
|
|
||||||
public final static String OPEN_ACCESS = "OPEN";
|
public final static String OPEN_ACCESS = "OPEN";
|
||||||
|
public final static String IS_MERGED_IN_CLASS = "isMergedIn";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,8 @@ import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class ConversionUtils {
|
public class ConversionUtils {
|
||||||
|
@ -33,4 +35,15 @@ public class ConversionUtils {
|
||||||
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
|
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||||
|
final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset();
|
||||||
|
// TODO
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication d) {
|
||||||
|
final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication();
|
||||||
|
// TODO
|
||||||
|
return res;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<artifactId>dhp-dedup-openaire</artifactId>
|
<artifactId>dhp-dedup-openaire</artifactId>
|
||||||
|
|
|
@ -1,28 +1,26 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
|
||||||
|
|
||||||
import com.wcohen.ss.JaroWinkler;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class AuthorMerger {
|
public class AuthorMerger {
|
||||||
|
|
||||||
private static final Double THRESHOLD = 0.95;
|
private static final Double THRESHOLD = 0.95;
|
||||||
|
|
||||||
public static List<Author> merge(List<List<Author>> authors) {
|
public static List<Author> merge(List<List<Author>> authors) {
|
||||||
|
|
||||||
authors.sort(new Comparator<List<Author>>() {
|
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
||||||
@Override
|
|
||||||
public int compare(List<Author> o1, List<Author> o2) {
|
|
||||||
return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
List<Author> author = new ArrayList<>();
|
List<Author> author = new ArrayList<>();
|
||||||
|
|
||||||
|
@ -83,18 +81,30 @@ public class AuthorMerger {
|
||||||
.stream()
|
.stream()
|
||||||
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
||||||
.max(Comparator.comparing(Tuple2::_1));
|
.max(Comparator.comparing(Tuple2::_1));
|
||||||
if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
|
|
||||||
|
if (simAuthor.isPresent()) {
|
||||||
|
double th = THRESHOLD;
|
||||||
|
// increase the threshold if the surname is too short
|
||||||
|
if (simAuthor.get()._2().getSurname() != null
|
||||||
|
&& simAuthor.get()._2().getSurname().length() <= 3)
|
||||||
|
th = 0.99;
|
||||||
|
|
||||||
|
if (simAuthor.get()._1() > th) {
|
||||||
Author r = simAuthor.get()._2();
|
Author r = simAuthor.get()._2();
|
||||||
if (r.getPid() == null) {
|
if (r.getPid() == null) {
|
||||||
r.setPid(new ArrayList<>());
|
r.setPid(new ArrayList<>());
|
||||||
}
|
}
|
||||||
r.getPid().add(a._1());
|
r.getPid().add(a._1());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String pidToComparableString(StructuredProperty pid) {
|
public static String pidToComparableString(StructuredProperty pid) {
|
||||||
return (pid.getQualifier()!=null? pid.getQualifier().getClassid()!=null?pid.getQualifier().getClassid().toLowerCase():"":"") + (pid.getValue()!=null? pid.getValue().toLowerCase():"");
|
return (pid.getQualifier() != null
|
||||||
|
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
|
||||||
|
: "")
|
||||||
|
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int countAuthorsPids(List<Author> authors) {
|
public static int countAuthorsPids(List<Author> authors) {
|
||||||
|
@ -115,9 +125,10 @@ public class AuthorMerger {
|
||||||
final Person pa = parse(a);
|
final Person pa = parse(a);
|
||||||
final Person pb = parse(b);
|
final Person pb = parse(b);
|
||||||
|
|
||||||
|
// if both are accurate (e.g. they have name and surname)
|
||||||
if (pa.isAccurate() & pb.isAccurate()) {
|
if (pa.isAccurate() & pb.isAccurate()) {
|
||||||
return new JaroWinkler()
|
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
|
||||||
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
|
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
|
||||||
} else {
|
} else {
|
||||||
return new JaroWinkler()
|
return new JaroWinkler()
|
||||||
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
@ -73,7 +74,8 @@ public class DedupRecordFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends OafEntity> T entityMerger(
|
public static <T extends OafEntity> T entityMerger(
|
||||||
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz) throws IllegalAccessException, InstantiationException {
|
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
|
||||||
|
throws IllegalAccessException, InstantiationException {
|
||||||
|
|
||||||
T entity = clazz.newInstance();
|
T entity = clazz.newInstance();
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -7,19 +10,18 @@ import java.io.Serializable;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class EntityMergerTest implements Serializable {
|
public class EntityMergerTest implements Serializable {
|
||||||
|
|
||||||
List<Tuple2<String, Publication>> publications;
|
List<Tuple2<String, Publication>> publications;
|
||||||
|
List<Tuple2<String, Publication>> publications2;
|
||||||
|
|
||||||
String testEntityBasePath;
|
String testEntityBasePath;
|
||||||
DataInfo dataInfo;
|
DataInfo dataInfo;
|
||||||
|
@ -35,6 +37,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
||||||
|
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
|
||||||
|
|
||||||
pub_top = getTopPub(publications);
|
pub_top = getTopPub(publications);
|
||||||
|
|
||||||
|
@ -45,7 +48,8 @@ public class EntityMergerTest implements Serializable {
|
||||||
@Test
|
@Test
|
||||||
public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
||||||
|
|
||||||
Publication pub_merged = DedupRecordFactory.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
Publication pub_merged = DedupRecordFactory
|
||||||
|
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
assertEquals(dedupId, pub_merged.getId());
|
assertEquals(dedupId, pub_merged.getId());
|
||||||
|
|
||||||
|
@ -86,6 +90,25 @@ public class EntityMergerTest implements Serializable {
|
||||||
// verify authors
|
// verify authors
|
||||||
assertEquals(pub_merged.getAuthor().size(), 9);
|
assertEquals(pub_merged.getAuthor().size(), 9);
|
||||||
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
||||||
|
|
||||||
|
// verify title
|
||||||
|
int count = 0;
|
||||||
|
for (StructuredProperty title : pub_merged.getTitle()) {
|
||||||
|
if (title.getQualifier().getClassid().equals("main title"))
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
|
||||||
|
|
||||||
|
Publication pub_merged = DedupRecordFactory
|
||||||
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
|
assertEquals(pub_merged.getAuthor().size(), 27);
|
||||||
|
// insert assertions here
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public DataInfo setDI() {
|
public DataInfo setDI() {
|
||||||
|
@ -118,11 +141,11 @@ public class EntityMergerTest implements Serializable {
|
||||||
reader = new BufferedReader(new FileReader(path));
|
reader = new BufferedReader(new FileReader(path));
|
||||||
String line = reader.readLine();
|
String line = reader.readLine();
|
||||||
while (line != null) {
|
while (line != null) {
|
||||||
res.add(
|
res
|
||||||
|
.add(
|
||||||
new Tuple2<>(
|
new Tuple2<>(
|
||||||
MapDocumentUtil.getJPathString("$.id", line),
|
MapDocumentUtil.getJPathString("$.id", line),
|
||||||
new ObjectMapper().readValue(line, clazz))
|
new ObjectMapper().readValue(line, clazz)));
|
||||||
);
|
|
||||||
// read next line
|
// read next line
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
}
|
}
|
||||||
|
@ -134,5 +157,4 @@ public class EntityMergerTest implements Serializable {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.bulktag;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -100,6 +101,7 @@ public class SparkBulkTagJob {
|
||||||
|
|
||||||
ResultTagger resultTagger = new ResultTagger();
|
ResultTagger resultTagger = new ResultTagger();
|
||||||
readPath(spark, inputPath, resultClazz)
|
readPath(spark, inputPath, resultClazz)
|
||||||
|
.map(patchResult(), Encoders.bean(resultClazz))
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<R, R>) value -> resultTagger
|
(MapFunction<R, R>) value -> resultTagger
|
||||||
.enrichContextCriteria(
|
.enrichContextCriteria(
|
||||||
|
@ -119,4 +121,17 @@ public class SparkBulkTagJob {
|
||||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO remove this hack as soon as the values fixed by this method will be provided as NON null
|
||||||
|
private static <R extends Result> MapFunction<R, R> patchResult() {
|
||||||
|
return (MapFunction<R, R>) r -> {
|
||||||
|
if (r.getDataInfo().getDeletedbyinference() == null) {
|
||||||
|
r.getDataInfo().setDeletedbyinference(false);
|
||||||
|
}
|
||||||
|
if (r.getContext() == null) {
|
||||||
|
r.setContext(new ArrayList<>());
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -131,7 +131,7 @@ public class CommunityConfiguration implements Serializable {
|
||||||
p -> {
|
p -> {
|
||||||
if (p.getSnd() == null)
|
if (p.getSnd() == null)
|
||||||
return p.getFst();
|
return p.getFst();
|
||||||
if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
|
if (p.getSnd().verifyCriteria(param))
|
||||||
return p.getFst();
|
return p.getFst();
|
||||||
else
|
else
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class VerbResolver implements Serializable {
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.toMap(
|
.toMap(
|
||||||
value -> (String) ((ClassInfo) value)
|
value -> (String) value
|
||||||
.getAnnotationInfo()
|
.getAnnotationInfo()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getParameterValues()
|
.getParameterValues()
|
||||||
|
|
|
@ -77,9 +77,15 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
List<String> allowedtypes,
|
List<String> allowedtypes,
|
||||||
String inputPath,
|
String inputPath,
|
||||||
String outputPath) {
|
String outputPath) {
|
||||||
String whitelisted = "";
|
String whitelisted = " d.id = '" + whitelist.get(0) + "'";
|
||||||
for (String i : whitelist) {
|
for (int i = 1; i < whitelist.size(); i++) {
|
||||||
whitelisted += " OR id = '" + i + "'";
|
whitelisted += " OR d.id = '" + whitelist.get(i) + "'";
|
||||||
|
}
|
||||||
|
|
||||||
|
String allowed = "d.datasourcetype.classid = '" + allowedtypes.get(0) + "'";
|
||||||
|
|
||||||
|
for (int i = 1; i < allowedtypes.size(); i++) {
|
||||||
|
allowed += " OR d.datasourcetype.classid = '" + allowedtypes.get(i) + "'";
|
||||||
}
|
}
|
||||||
|
|
||||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||||
|
@ -90,26 +96,39 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
relation.createOrReplaceTempView("relation");
|
relation.createOrReplaceTempView("relation");
|
||||||
organization.createOrReplaceTempView("organization");
|
organization.createOrReplaceTempView("organization");
|
||||||
|
|
||||||
String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
// String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
||||||
+ "FROM ( SELECT id "
|
// + "FROM ( SELECT id "
|
||||||
+ " FROM datasource "
|
// + " FROM datasource "
|
||||||
+ " WHERE (datainfo.deletedbyinference = false "
|
// + " WHERE (datainfo.deletedbyinference = false "
|
||||||
+ whitelisted
|
// + whitelisted
|
||||||
+ ") "
|
// + ") "
|
||||||
+ getConstraintList("datasourcetype.classid = '", allowedtypes)
|
// + getConstraintList("datasourcetype.classid = '", allowedtypes)
|
||||||
+ ") d "
|
// + ") d "
|
||||||
+ "JOIN ( SELECT source, target "
|
// + "JOIN ( SELECT source, target "
|
||||||
+ " FROM relation "
|
// + " FROM relation "
|
||||||
+ " WHERE relclass = '"
|
// + " WHERE relclass = '"
|
||||||
+ ModelConstants.IS_PROVIDED_BY
|
// + ModelConstants.IS_PROVIDED_BY
|
||||||
+ "' "
|
// + "' "
|
||||||
+ " AND datainfo.deletedbyinference = false ) rel "
|
// + " AND datainfo.deletedbyinference = false ) rel "
|
||||||
+ "ON d.id = rel.source "
|
// + "ON d.id = rel.source "
|
||||||
+ "JOIN (SELECT id, country "
|
// + "JOIN (SELECT id, country "
|
||||||
+ " FROM organization "
|
// + " FROM organization "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
// + " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND length(country.classid) > 0) o "
|
// + " AND length(country.classid) > 0) o "
|
||||||
+ "ON o.id = rel.target";
|
// + "ON o.id = rel.target";
|
||||||
|
|
||||||
|
String query = "SELECT source dataSourceId, " +
|
||||||
|
"named_struct('classid', country.classid, 'classname', country.classname) country " +
|
||||||
|
"FROM datasource d " +
|
||||||
|
"JOIN relation rel " +
|
||||||
|
"ON d.id = rel.source " +
|
||||||
|
"JOIN organization o " +
|
||||||
|
"ON o.id = rel.target " +
|
||||||
|
"WHERE rel.datainfo.deletedbyinference = false " +
|
||||||
|
"and rel.relclass = '" + ModelConstants.IS_PROVIDED_BY + "'" +
|
||||||
|
"and o.datainfo.deletedbyinference = false " +
|
||||||
|
"and length(o.country.classid) > 0 " +
|
||||||
|
"and (" + allowed + " or " + whitelisted + ")";
|
||||||
|
|
||||||
spark
|
spark
|
||||||
.sql(query)
|
.sql(query)
|
||||||
|
|
|
@ -4,7 +4,12 @@ package eu.dnetlib.dhp.countrypropagation;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -13,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultCountrySet {
|
public class PrepareResultCountrySet {
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
||||||
|
@ -60,6 +66,7 @@ public class PrepareResultCountrySet {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
getPotentialResultToUpdate(
|
getPotentialResultToUpdate(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -89,10 +96,33 @@ public class PrepareResultCountrySet {
|
||||||
spark
|
spark
|
||||||
.sql(RESULT_COUNTRYSET_QUERY)
|
.sql(RESULT_COUNTRYSET_QUERY)
|
||||||
.as(Encoders.bean(ResultCountrySet.class))
|
.as(Encoders.bean(ResultCountrySet.class))
|
||||||
.write()
|
.toJavaRDD()
|
||||||
.option("compression", "gzip")
|
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||||
.mode(SaveMode.Append)
|
.reduceByKey((a, b) -> {
|
||||||
.json(outputPath);
|
ArrayList<CountrySbs> countryList = a.getCountrySet();
|
||||||
|
Set<String> countryCodes = countryList
|
||||||
|
.stream()
|
||||||
|
.map(country -> country.getClassid())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
b
|
||||||
|
.getCountrySet()
|
||||||
|
.stream()
|
||||||
|
.forEach(c -> {
|
||||||
|
if (!countryCodes.contains(c.getClassid())) {
|
||||||
|
countryList.add(c);
|
||||||
|
countryCodes.add(c.getClassid());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
a.setCountrySet(countryList);
|
||||||
|
return a;
|
||||||
|
})
|
||||||
|
.map(couple -> OBJECT_MAPPER.writeValueAsString(couple._2()))
|
||||||
|
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
// .write()
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .mode(SaveMode.Append)
|
||||||
|
// .json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
@ -121,30 +122,39 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
|
private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
|
||||||
|
PacePerson pp = new PacePerson(a.getFullname(), false);
|
||||||
for (AutoritativeAuthor aa : au) {
|
for (AutoritativeAuthor aa : au) {
|
||||||
if (enrichAuthor(aa, a)) {
|
if (enrichAuthor(aa, a, pp.getNormalisedFirstName(), pp.getNormalisedSurname())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
|
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author,
|
||||||
|
String author_name,
|
||||||
|
String author_surname) {
|
||||||
boolean toaddpid = false;
|
boolean toaddpid = false;
|
||||||
|
|
||||||
if (StringUtils.isNotEmpty(autoritative_author.getSurname())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getSurname())) {
|
||||||
if (StringUtils.isNotEmpty(author.getSurname())) {
|
if (StringUtils.isNotEmpty(author.getSurname())) {
|
||||||
|
author_surname = author.getSurname();
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotEmpty(author_surname)) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getSurname()
|
.getSurname()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getSurname().trim())) {
|
.equalsIgnoreCase(author_surname.trim())) {
|
||||||
|
|
||||||
// have the same surname. Check the name
|
// have the same surname. Check the name
|
||||||
if (StringUtils.isNotEmpty(autoritative_author.getName())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getName())) {
|
||||||
if (StringUtils.isNotEmpty(author.getName())) {
|
if (StringUtils.isNotEmpty(author.getName())) {
|
||||||
|
author_name = author.getName();
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotEmpty(author_name)) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getName()
|
.getName()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getName().trim())) {
|
.equalsIgnoreCase(author_name.trim())) {
|
||||||
toaddpid = true;
|
toaddpid = true;
|
||||||
}
|
}
|
||||||
// they could be differently written (i.e. only the initials of the name
|
// they could be differently written (i.e. only the initials of the name
|
||||||
|
@ -154,7 +164,7 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
.getName()
|
.getName()
|
||||||
.trim()
|
.trim()
|
||||||
.substring(0, 0)
|
.substring(0, 0)
|
||||||
.equalsIgnoreCase(author.getName().trim().substring(0, 0))) {
|
.equalsIgnoreCase(author_name.trim().substring(0, 0))) {
|
||||||
toaddpid = true;
|
toaddpid = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -105,11 +105,7 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(
|
.forEach(
|
||||||
(p -> {
|
(p -> {
|
||||||
if (potential_update
|
|
||||||
.getProjectSet()
|
|
||||||
.contains(p)) {
|
|
||||||
potential_update.getProjectSet().remove(p);
|
potential_update.getProjectSet().remove(p);
|
||||||
}
|
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
String resId = potential_update.getResultId();
|
String resId = potential_update.getResultId();
|
||||||
|
|
|
@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
|
@ -19,6 +20,7 @@ import com.google.gson.Gson;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultCommunitySet {
|
public class PrepareResultCommunitySet {
|
||||||
|
|
||||||
|
@ -93,10 +95,24 @@ public class PrepareResultCommunitySet {
|
||||||
result_organizationset
|
result_organizationset
|
||||||
.map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
|
.map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.write()
|
.toJavaRDD()
|
||||||
.mode(SaveMode.Overwrite)
|
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||||
.option("compression", "gzip")
|
.reduceByKey((a, b) -> {
|
||||||
.json(outputPath);
|
ArrayList<String> cl = a.getCommunityList();
|
||||||
|
b.getCommunityList().stream().forEach(s -> {
|
||||||
|
if (!cl.contains(s)) {
|
||||||
|
cl.add(s);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
a.setCommunityList(cl);
|
||||||
|
return a;
|
||||||
|
})
|
||||||
|
.map(value -> OBJECT_MAPPER.writeValueAsString(value._2()))
|
||||||
|
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
|
private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
|
||||||
|
|
|
@ -136,9 +136,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(
|
.forEach(
|
||||||
rId -> {
|
rId -> {
|
||||||
if (organization_list.contains(rId)) {
|
|
||||||
organization_list.remove(rId);
|
organization_list.remove(rId);
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
String resultId = potential_update.getResultId();
|
String resultId = potential_update.getResultId();
|
||||||
|
|
|
@ -99,6 +99,7 @@ public class ResultToOrganizationJobTest {
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
|
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
|
||||||
|
|
||||||
Assertions.assertEquals(0, tmp.count());
|
Assertions.assertEquals(0, tmp.count());
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -171,6 +172,7 @@ public class ResultToOrganizationJobTest {
|
||||||
+ "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' "
|
+ "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' "
|
||||||
+ "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')")
|
+ "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')")
|
||||||
.count());
|
.count());
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -266,5 +268,6 @@ public class ResultToOrganizationJobTest {
|
||||||
"relclass = 'isAuthorInstitutionOf' and "
|
"relclass = 'isAuthorInstitutionOf' and "
|
||||||
+ "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'")
|
+ "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'")
|
||||||
.count());
|
.count());
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -11,13 +11,9 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
|
@ -25,6 +21,7 @@ import org.dom4j.DocumentFactory;
|
||||||
import org.dom4j.DocumentHelper;
|
import org.dom4j.DocumentHelper;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
@ -50,6 +47,10 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||||
|
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
||||||
|
"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||||
|
protected static final Qualifier MAG_PID_TYPE = qualifier(
|
||||||
|
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||||
|
|
||||||
protected static final Map<String, String> nsContext = new HashMap<>();
|
protected static final Map<String, String> nsContext = new HashMap<>();
|
||||||
|
|
||||||
|
@ -75,8 +76,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||||
|
|
||||||
final Document doc = DocumentHelper
|
final Document doc = DocumentHelper
|
||||||
.parseText(
|
.parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));
|
||||||
xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));
|
|
||||||
|
|
||||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
||||||
final KeyValue collectedFrom = getProvenanceDatasource(
|
final KeyValue collectedFrom = getProvenanceDatasource(
|
||||||
|
@ -103,7 +103,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private KeyValue getProvenanceDatasource(Document doc, String xpathId, String xpathName) {
|
private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
|
||||||
final String dsId = doc.valueOf(xpathId);
|
final String dsId = doc.valueOf(xpathId);
|
||||||
final String dsName = doc.valueOf(xpathName);
|
final String dsName = doc.valueOf(xpathName);
|
||||||
|
|
||||||
|
@ -111,9 +111,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return keyValue(
|
return keyValue(createOpenaireId(10, dsId, true), dsName);
|
||||||
createOpenaireId(10, dsId, true),
|
|
||||||
dsName);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected List<Oaf> createOafs(
|
protected List<Oaf> createOafs(
|
||||||
|
@ -211,8 +209,14 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass,
|
protected Relation getRelation(final String source,
|
||||||
KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) {
|
final String target,
|
||||||
|
final String relType,
|
||||||
|
final String subRelType,
|
||||||
|
final String relClass,
|
||||||
|
final KeyValue collectedFrom,
|
||||||
|
final DataInfo info,
|
||||||
|
final long lastUpdateTimestamp) {
|
||||||
final Relation rel = new Relation();
|
final Relation rel = new Relation();
|
||||||
rel.setRelType(relType);
|
rel.setRelType(relType);
|
||||||
rel.setSubRelType(subRelType);
|
rel.setSubRelType(subRelType);
|
||||||
|
@ -269,7 +273,9 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setCoverage(prepareCoverages(doc, info));
|
r.setCoverage(prepareCoverages(doc, info));
|
||||||
r.setContext(prepareContexts(doc, info));
|
r.setContext(prepareContexts(doc, info));
|
||||||
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
|
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
|
||||||
|
r.setInstance(instances);
|
||||||
|
r.setBestaccessright(getBestAccessRights(instances));
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Context> prepareContexts(final Document doc, final DataInfo info) {
|
private List<Context> prepareContexts(final Document doc, final DataInfo info) {
|
||||||
|
@ -289,7 +295,10 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Instance> prepareInstances(
|
protected abstract List<Instance> prepareInstances(
|
||||||
Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
|
Document doc,
|
||||||
|
DataInfo info,
|
||||||
|
KeyValue collectedfrom,
|
||||||
|
KeyValue hostedby);
|
||||||
|
|
||||||
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
@ -314,13 +323,16 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
|
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Field<String>> prepareOtherResearchProductTools(
|
protected abstract List<Field<String>> prepareOtherResearchProductTools(
|
||||||
Document doc, DataInfo info);
|
Document doc,
|
||||||
|
DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(
|
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(
|
||||||
Document doc, DataInfo info);
|
Document doc,
|
||||||
|
DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(
|
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(
|
||||||
Document doc, DataInfo info);
|
Document doc,
|
||||||
|
DataInfo info);
|
||||||
|
|
||||||
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
|
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
@ -329,7 +341,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
|
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(
|
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(
|
||||||
Document doc, DataInfo info);
|
Document doc,
|
||||||
|
DataInfo info);
|
||||||
|
|
||||||
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
|
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
@ -345,6 +358,34 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected static Qualifier getBestAccessRights(List<Instance> instanceList) {
|
||||||
|
if (instanceList != null) {
|
||||||
|
final Optional<Qualifier> min = instanceList
|
||||||
|
.stream()
|
||||||
|
.map(i -> i.getAccessright())
|
||||||
|
.min(new LicenseComparator());
|
||||||
|
|
||||||
|
final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(rights.getClassid())) {
|
||||||
|
rights.setClassid(UNKNOWN);
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(rights.getClassname())
|
||||||
|
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||||
|
rights.setClassname(NOT_AVAILABLE);
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||||
|
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(rights.getSchemename())) {
|
||||||
|
rights.setSchemename(DNET_ACCESS_MODES);
|
||||||
|
}
|
||||||
|
|
||||||
|
return rights;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||||
final Node n = doc.selectSingleNode("//oaf:journal");
|
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||||
if (n != null) {
|
if (n != null) {
|
||||||
|
@ -358,26 +399,17 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final String vol = n.valueOf("@vol");
|
final String vol = n.valueOf("@vol");
|
||||||
final String edition = n.valueOf("@edition");
|
final String edition = n.valueOf("@edition");
|
||||||
if (StringUtils.isNotBlank(name)) {
|
if (StringUtils.isNotBlank(name)) {
|
||||||
return journal(
|
return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info);
|
||||||
name,
|
|
||||||
issnPrinted,
|
|
||||||
issnOnline,
|
|
||||||
issnLinking,
|
|
||||||
ep,
|
|
||||||
iss,
|
|
||||||
sp,
|
|
||||||
vol,
|
|
||||||
edition,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
info);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Qualifier prepareQualifier(
|
protected Qualifier prepareQualifier(
|
||||||
final Node node, final String xpath, final String schemeId, final String schemeName) {
|
final Node node,
|
||||||
|
final String xpath,
|
||||||
|
final String schemeId,
|
||||||
|
final String schemeName) {
|
||||||
final String classId = node.valueOf(xpath);
|
final String classId = node.valueOf(xpath);
|
||||||
final String className = code2name.get(classId);
|
final String className = code2name.get(classId);
|
||||||
return qualifier(classId, className, schemeId, schemeName);
|
return qualifier(classId, className, schemeId, schemeName);
|
||||||
|
@ -401,7 +433,10 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected List<StructuredProperty> prepareListStructProps(
|
protected List<StructuredProperty> prepareListStructProps(
|
||||||
final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
final Node node,
|
||||||
|
final String xpath,
|
||||||
|
final Qualifier qualifier,
|
||||||
|
final DataInfo info) {
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
for (final Object o : node.selectNodes(xpath)) {
|
for (final Object o : node.selectNodes(xpath)) {
|
||||||
final Node n = (Node) o;
|
final Node n = (Node) o;
|
||||||
|
@ -411,19 +446,17 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected List<StructuredProperty> prepareListStructProps(
|
protected List<StructuredProperty> prepareListStructProps(
|
||||||
final Node node, final String xpath, final DataInfo info) {
|
final Node node,
|
||||||
|
final String xpath,
|
||||||
|
final DataInfo info) {
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
for (final Object o : node.selectNodes(xpath)) {
|
for (final Object o : node.selectNodes(xpath)) {
|
||||||
final Node n = (Node) o;
|
final Node n = (Node) o;
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
structuredProperty(
|
structuredProperty(
|
||||||
n.getText(),
|
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
|
||||||
n.valueOf("@classid"),
|
n.valueOf("@schemename"), info));
|
||||||
n.valueOf("@classname"),
|
|
||||||
n.valueOf("@schemeid"),
|
|
||||||
n.valueOf("@schemename"),
|
|
||||||
info));
|
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
@ -449,8 +482,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
||||||
|
|
||||||
if (n == null) {
|
if (n == null) {
|
||||||
return dataInfo(
|
return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
|
||||||
false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
||||||
|
@ -464,12 +496,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final String trust = n.valueOf("./oaf:trust");
|
final String trust = n.valueOf("./oaf:trust");
|
||||||
|
|
||||||
return dataInfo(
|
return dataInfo(
|
||||||
deletedbyinference,
|
deletedbyinference, inferenceprovenance, inferred, false,
|
||||||
inferenceprovenance,
|
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||||
inferred,
|
|
||||||
false,
|
|
||||||
qualifier(paClassId, paClassName, paSchemeId, paSchemeName),
|
|
||||||
trust);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
||||||
|
@ -477,7 +505,9 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected List<Field<String>> prepareListFields(
|
protected List<Field<String>> prepareListFields(
|
||||||
final Node node, final String xpath, final DataInfo info) {
|
final Node node,
|
||||||
|
final String xpath,
|
||||||
|
final DataInfo info) {
|
||||||
return listFields(info, prepareListString(node, xpath));
|
return listFields(info, prepareListString(node, xpath));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,19 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||||
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -14,9 +23,16 @@ import org.dom4j.Node;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
@ -39,14 +55,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
author.setSurname(p.getNormalisedSurname());
|
author.setSurname(p.getNormalisedSurname());
|
||||||
}
|
}
|
||||||
|
|
||||||
final String pid = e.attributeValue("nameIdentifier");
|
final String pid = e.valueOf("./@nameIdentifier");
|
||||||
final String pidType = e.attributeValue("nameIdentifierScheme");
|
final String type = e
|
||||||
|
.valueOf("./@nameIdentifierScheme")
|
||||||
|
.trim()
|
||||||
|
.toUpperCase()
|
||||||
|
.replaceAll(" ", "")
|
||||||
|
.replaceAll("_", "");
|
||||||
|
|
||||||
author.setPid(new ArrayList<>());
|
author.setPid(new ArrayList<>());
|
||||||
if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) {
|
|
||||||
author
|
if (StringUtils.isNotBlank(pid)) {
|
||||||
.getPid()
|
if (type.startsWith("ORCID")) {
|
||||||
.add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info));
|
final String cleanedId = pid
|
||||||
|
.replaceAll("http://orcid.org/", "")
|
||||||
|
.replaceAll("https://orcid.org/", "");
|
||||||
|
author.getPid().add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
||||||
|
} else if (type.startsWith("MAGID")) {
|
||||||
|
author.getPid().add(structuredProperty(pid, MAG_PID_TYPE, info));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
res.add(author);
|
res.add(author);
|
||||||
|
@ -104,28 +131,21 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final Instance instance = new Instance();
|
final Instance instance = new Instance();
|
||||||
instance
|
instance
|
||||||
.setInstancetype(
|
.setInstancetype(
|
||||||
prepareQualifier(
|
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
|
||||||
doc,
|
|
||||||
"//dr:CobjCategory",
|
|
||||||
DNET_PUBLICATION_RESOURCE,
|
|
||||||
DNET_PUBLICATION_RESOURCE));
|
|
||||||
instance.setCollectedfrom(collectedfrom);
|
instance.setCollectedfrom(collectedfrom);
|
||||||
instance.setHostedby(hostedby);
|
instance.setHostedby(hostedby);
|
||||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||||
instance
|
instance
|
||||||
.setAccessright(
|
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
|
||||||
prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
|
|
||||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||||
instance
|
instance
|
||||||
.setProcessingchargeamount(
|
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||||
field(doc.valueOf("//oaf:processingchargeamount"), info));
|
|
||||||
instance
|
instance
|
||||||
.setProcessingchargecurrency(
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
|
||||||
|
|
||||||
List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||||
instance
|
instance
|
||||||
.setUrl(
|
.setUrl(
|
||||||
nodes
|
nodes
|
||||||
|
@ -158,19 +178,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(
|
protected Field<String> prepareSoftwareCodeRepositoryUrl(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return null; // NOT PRESENT IN OAF
|
return null; // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<StructuredProperty> prepareSoftwareLicenses(
|
protected List<StructuredProperty> prepareSoftwareLicenses(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(
|
protected List<Field<String>> prepareSoftwareDocumentationUrls(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,13 +205,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetMetadataVersionNumber(
|
protected Field<String> prepareDatasetMetadataVersionNumber(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return null; // NOT PRESENT IN OAF
|
return null; // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetLastMetadataUpdate(
|
protected Field<String> prepareDatasetLastMetadataUpdate(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return null; // NOT PRESENT IN OAF
|
return null; // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -216,19 +241,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductTools(
|
protected List<Field<String>> prepareOtherResearchProductTools(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(
|
protected List<Field<String>> prepareOtherResearchProductContactGroups(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(
|
protected List<Field<String>> prepareOtherResearchProductContactPersons(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,16 +4,31 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PART;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
@ -22,7 +37,6 @@ import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
@ -48,7 +62,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final String fullname = n.valueOf("./datacite:creatorName");
|
final String fullname = n.valueOf("./datacite:creatorName");
|
||||||
author.setFullname(fullname);
|
author.setFullname(fullname);
|
||||||
|
|
||||||
PacePerson pp = new PacePerson(fullname, false);
|
final PacePerson pp = new PacePerson(fullname, false);
|
||||||
final String name = n.valueOf("./datacite:givenName");
|
final String name = n.valueOf("./datacite:givenName");
|
||||||
if (StringUtils.isBlank(name) & pp.isAccurate()) {
|
if (StringUtils.isBlank(name) & pp.isAccurate()) {
|
||||||
author.setName(pp.getNormalisedFirstName());
|
author.setName(pp.getNormalisedFirstName());
|
||||||
|
@ -63,6 +77,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
author.setSurname(surname);
|
author.setSurname(surname);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(author.getFullname())) {
|
||||||
|
author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
|
||||||
|
}
|
||||||
|
|
||||||
author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
|
author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
|
||||||
author.setPid(preparePids(n, info));
|
author.setPid(preparePids(n, info));
|
||||||
author.setRank(pos++);
|
author.setRank(pos++);
|
||||||
|
@ -74,13 +92,21 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
private List<StructuredProperty> preparePids(final Node n, final DataInfo info) {
|
private List<StructuredProperty> preparePids(final Node n, final DataInfo info) {
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
for (final Object o : n.selectNodes("./datacite:nameIdentifier")) {
|
for (final Object o : n.selectNodes("./datacite:nameIdentifier")) {
|
||||||
res
|
|
||||||
.add(
|
final String id = ((Node) o).getText();
|
||||||
structuredProperty(
|
final String type = ((Node) o)
|
||||||
((Node) o).getText(),
|
.valueOf("./@nameIdentifierScheme")
|
||||||
prepareQualifier(
|
.trim()
|
||||||
(Node) o, "./@nameIdentifierScheme", DNET_PID_TYPES, DNET_PID_TYPES),
|
.toUpperCase()
|
||||||
info));
|
.replaceAll(" ", "")
|
||||||
|
.replaceAll("_", "");
|
||||||
|
|
||||||
|
if (type.startsWith("ORCID")) {
|
||||||
|
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
|
||||||
|
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
||||||
|
} else if (type.startsWith("MAGID")) {
|
||||||
|
res.add(structuredProperty(id, MAG_PID_TYPE, info));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
@ -95,21 +121,18 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final Instance instance = new Instance();
|
final Instance instance = new Instance();
|
||||||
instance
|
instance
|
||||||
.setInstancetype(
|
.setInstancetype(
|
||||||
prepareQualifier(
|
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
|
||||||
doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
|
|
||||||
instance.setCollectedfrom(collectedfrom);
|
instance.setCollectedfrom(collectedfrom);
|
||||||
instance.setHostedby(hostedby);
|
instance.setHostedby(hostedby);
|
||||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||||
instance
|
instance
|
||||||
.setAccessright(
|
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
|
||||||
prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
|
|
||||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||||
instance
|
instance
|
||||||
.setProcessingchargecurrency(
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
|
||||||
|
|
||||||
final Set<String> url = new HashSet<>();
|
final Set<String> url = new HashSet<>();
|
||||||
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||||
|
@ -149,11 +172,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
structuredProperty(
|
structuredProperty(
|
||||||
((Node) o).getText(),
|
((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE,
|
||||||
"UNKNOWN",
|
|
||||||
"UNKNOWN",
|
|
||||||
DNET_DATA_CITE_DATE,
|
|
||||||
DNET_DATA_CITE_DATE,
|
|
||||||
info));
|
info));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -197,53 +216,52 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductTools(
|
protected List<Field<String>> prepareOtherResearchProductTools(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // Not present in ODF ???
|
return new ArrayList<>(); // Not present in ODF ???
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(
|
protected List<Field<String>> prepareOtherResearchProductContactGroups(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return prepareListFields(
|
return prepareListFields(
|
||||||
doc,
|
doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
|
||||||
"//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName",
|
|
||||||
info);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(
|
protected List<Field<String>> prepareOtherResearchProductContactPersons(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return prepareListFields(
|
return prepareListFields(
|
||||||
doc,
|
doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
|
||||||
"//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName",
|
|
||||||
info);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||||
return prepareQualifier(
|
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
|
||||||
doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(
|
protected Field<String> prepareSoftwareCodeRepositoryUrl(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return null; // Not present in ODF ???
|
return null; // Not present in ODF ???
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<StructuredProperty> prepareSoftwareLicenses(
|
protected List<StructuredProperty> prepareSoftwareLicenses(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return new ArrayList<>(); // Not present in ODF ???
|
return new ArrayList<>(); // Not present in ODF ???
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(
|
protected List<Field<String>> prepareSoftwareDocumentationUrls(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return prepareListFields(
|
return prepareListFields(
|
||||||
doc,
|
doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
|
||||||
"//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']",
|
|
||||||
info);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// DATASETS
|
// DATASETS
|
||||||
|
@ -264,13 +282,15 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetMetadataVersionNumber(
|
protected Field<String> prepareDatasetMetadataVersionNumber(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return null; // Not present in ODF ???
|
return null; // Not present in ODF ???
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetLastMetadataUpdate(
|
protected Field<String> prepareDatasetLastMetadataUpdate(
|
||||||
final Document doc, final DataInfo info) {
|
final Document doc,
|
||||||
|
final DataInfo info) {
|
||||||
return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
|
return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -346,9 +366,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||||
return prepareQualifier(
|
return prepareQualifier(
|
||||||
doc,
|
doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE,
|
||||||
"//*[local-name() = 'resource']//*[local-name() = 'resourceType']",
|
|
||||||
DNET_DATA_CITE_RESOURCE,
|
|
||||||
DNET_DATA_CITE_RESOURCE);
|
DNET_DATA_CITE_RESOURCE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,7 +21,14 @@ import org.mockito.Mock;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
public class MappersTest {
|
public class MappersTest {
|
||||||
|
@ -54,13 +61,13 @@ public class MappersTest {
|
||||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
|
|
||||||
assertTrue(p.getAuthor().size() > 0);
|
assertTrue(p.getAuthor().size() > 0);
|
||||||
Optional<Author> author = p
|
final Optional<Author> author = p
|
||||||
.getAuthor()
|
.getAuthor()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||||
.findFirst();
|
.findFirst();
|
||||||
assertTrue(author.isPresent());
|
assertTrue(author.isPresent());
|
||||||
StructuredProperty pid = author
|
final StructuredProperty pid = author
|
||||||
.get()
|
.get()
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
|
@ -68,7 +75,7 @@ public class MappersTest {
|
||||||
.get();
|
.get();
|
||||||
assertEquals("0000-0001-6651-1178", pid.getValue());
|
assertEquals("0000-0001-6651-1178", pid.getValue());
|
||||||
assertEquals("ORCID", pid.getQualifier().getClassid());
|
assertEquals("ORCID", pid.getQualifier().getClassid());
|
||||||
assertEquals("ORCID", pid.getQualifier().getClassname());
|
assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||||
assertEquals("Votsi,Nefta", author.get().getFullname());
|
assertEquals("Votsi,Nefta", author.get().getFullname());
|
||||||
|
@ -78,8 +85,19 @@ public class MappersTest {
|
||||||
assertTrue(p.getSubject().size() > 0);
|
assertTrue(p.getSubject().size() > 0);
|
||||||
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
|
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
|
||||||
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
|
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
|
||||||
assertTrue(p.getInstance().size() > 0);
|
|
||||||
|
|
||||||
|
assertNotNull(p.getInstance());
|
||||||
|
assertTrue(p.getInstance().size() > 0);
|
||||||
|
p
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.forEach(i -> {
|
||||||
|
assertNotNull(i.getAccessright());
|
||||||
|
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||||
|
});
|
||||||
|
|
||||||
|
assertNotNull(p.getBestaccessright());
|
||||||
|
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||||
assertValidId(r1.getSource());
|
assertValidId(r1.getSource());
|
||||||
assertValidId(r1.getTarget());
|
assertValidId(r1.getTarget());
|
||||||
assertValidId(r2.getSource());
|
assertValidId(r2.getSource());
|
||||||
|
@ -121,13 +139,13 @@ public class MappersTest {
|
||||||
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
||||||
assertTrue(d.getAuthor().size() > 0);
|
assertTrue(d.getAuthor().size() > 0);
|
||||||
|
|
||||||
Optional<Author> author = d
|
final Optional<Author> author = d
|
||||||
.getAuthor()
|
.getAuthor()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||||
.findFirst();
|
.findFirst();
|
||||||
assertTrue(author.isPresent());
|
assertTrue(author.isPresent());
|
||||||
StructuredProperty pid = author
|
final StructuredProperty pid = author
|
||||||
.get()
|
.get()
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
|
@ -135,7 +153,7 @@ public class MappersTest {
|
||||||
.get();
|
.get();
|
||||||
assertEquals("0000-0001-9074-1619", pid.getValue());
|
assertEquals("0000-0001-9074-1619", pid.getValue());
|
||||||
assertEquals("ORCID", pid.getQualifier().getClassid());
|
assertEquals("ORCID", pid.getQualifier().getClassid());
|
||||||
assertEquals("ORCID", pid.getQualifier().getClassname());
|
assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||||
assertEquals("Baracchini, Theo", author.get().getFullname());
|
assertEquals("Baracchini, Theo", author.get().getFullname());
|
||||||
|
@ -143,13 +161,13 @@ public class MappersTest {
|
||||||
assertEquals("Theo", author.get().getName());
|
assertEquals("Theo", author.get().getName());
|
||||||
|
|
||||||
assertEquals(1, author.get().getAffiliation().size());
|
assertEquals(1, author.get().getAffiliation().size());
|
||||||
Optional<Field<String>> opAff = author
|
final Optional<Field<String>> opAff = author
|
||||||
.get()
|
.get()
|
||||||
.getAffiliation()
|
.getAffiliation()
|
||||||
.stream()
|
.stream()
|
||||||
.findFirst();
|
.findFirst();
|
||||||
assertTrue(opAff.isPresent());
|
assertTrue(opAff.isPresent());
|
||||||
Field<String> affiliation = opAff.get();
|
final Field<String> affiliation = opAff.get();
|
||||||
assertEquals("ISTI-CNR", affiliation.getValue());
|
assertEquals("ISTI-CNR", affiliation.getValue());
|
||||||
|
|
||||||
assertTrue(d.getSubject().size() > 0);
|
assertTrue(d.getSubject().size() > 0);
|
||||||
|
@ -157,6 +175,16 @@ public class MappersTest {
|
||||||
assertTrue(d.getContext().size() > 0);
|
assertTrue(d.getContext().size() > 0);
|
||||||
assertTrue(d.getContext().get(0).getId().length() > 0);
|
assertTrue(d.getContext().get(0).getId().length() > 0);
|
||||||
|
|
||||||
|
assertNotNull(d.getInstance());
|
||||||
|
assertTrue(d.getInstance().size() > 0);
|
||||||
|
d
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.forEach(i -> {
|
||||||
|
assertNotNull(i.getAccessright());
|
||||||
|
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||||
|
});
|
||||||
|
|
||||||
assertValidId(r1.getSource());
|
assertValidId(r1.getSource());
|
||||||
assertValidId(r1.getTarget());
|
assertValidId(r1.getTarget());
|
||||||
assertValidId(r2.getSource());
|
assertValidId(r2.getSource());
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,9 @@ package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -19,8 +21,10 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Iterators;
|
import com.google.common.collect.Iterators;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
@ -58,6 +62,8 @@ public class PrepareRelationsJob {
|
||||||
|
|
||||||
public static final int MAX_RELS = 100;
|
public static final int MAX_RELS = 100;
|
||||||
|
|
||||||
|
public static final int DEFAULT_NUM_PARTITIONS = 3000;
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
String jsonConfiguration = IOUtils
|
String jsonConfiguration = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
|
@ -79,6 +85,24 @@ public class PrepareRelationsJob {
|
||||||
String outputPath = parser.get("outputPath");
|
String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
int relPartitions = Optional
|
||||||
|
.ofNullable(parser.get("relPartitions"))
|
||||||
|
.map(Integer::valueOf)
|
||||||
|
.orElse(DEFAULT_NUM_PARTITIONS);
|
||||||
|
log.info("relPartitions: {}", relPartitions);
|
||||||
|
|
||||||
|
Set<String> relationFilter = Optional
|
||||||
|
.ofNullable(parser.get("relationFilter"))
|
||||||
|
.map(s -> Sets.newHashSet(Splitter.on(",").split(s)))
|
||||||
|
.orElse(new HashSet<>());
|
||||||
|
log.info("relationFilter: {}", relationFilter);
|
||||||
|
|
||||||
|
int maxRelations = Optional
|
||||||
|
.ofNullable(parser.get("maxRelations"))
|
||||||
|
.map(Integer::valueOf)
|
||||||
|
.orElse(MAX_RELS);
|
||||||
|
log.info("maxRelations: {}", maxRelations);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
|
@ -86,25 +110,74 @@ public class PrepareRelationsJob {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
prepareRelationsFromPaths(spark, inputRelationsPath, outputPath);
|
prepareRelationsRDD(
|
||||||
|
spark, inputRelationsPath, outputPath, relationFilter, relPartitions, maxRelations);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void prepareRelationsFromPaths(
|
/**
|
||||||
SparkSession spark, String inputRelationsPath, String outputPath) {
|
* Dataset based implementation that prepares the graph relations by limiting the number of outgoing links and
|
||||||
|
* filtering the relation types according to the given criteria.
|
||||||
|
*
|
||||||
|
* @param spark the spark session
|
||||||
|
* @param inputRelationsPath source path for the graph relations
|
||||||
|
* @param outputPath output path for the processed relations
|
||||||
|
* @param relationFilter set of relation filters applied to the `relClass` field
|
||||||
|
* @param maxRelations maximum number of allowed outgoing edges
|
||||||
|
*/
|
||||||
|
private static void prepareRelations(
|
||||||
|
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter,
|
||||||
|
int maxRelations) {
|
||||||
readPathRelation(spark, inputRelationsPath)
|
readPathRelation(spark, inputRelationsPath)
|
||||||
.filter("dataInfo.deletedbyinference == false")
|
.filter("dataInfo.deletedbyinference == false")
|
||||||
|
.filter((FilterFunction<SortableRelation>) rel -> !relationFilter.contains(rel.getRelClass()))
|
||||||
.groupByKey(
|
.groupByKey(
|
||||||
(MapFunction<SortableRelation, String>) value -> value.getSource(), Encoders.STRING())
|
(MapFunction<SortableRelation, String>) value -> value.getSource(), Encoders.STRING())
|
||||||
.flatMapGroups(
|
.flatMapGroups(
|
||||||
(FlatMapGroupsFunction<String, SortableRelation, SortableRelation>) (key, values) -> Iterators
|
(FlatMapGroupsFunction<String, SortableRelation, SortableRelation>) (key, values) -> Iterators
|
||||||
.limit(values, MAX_RELS),
|
.limit(values, maxRelations),
|
||||||
Encoders.bean(SortableRelation.class))
|
Encoders.bean(SortableRelation.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.parquet(outputPath);
|
.parquet(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering
|
||||||
|
* the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are
|
||||||
|
* prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation.
|
||||||
|
*
|
||||||
|
* @param spark the spark session
|
||||||
|
* @param inputRelationsPath source path for the graph relations
|
||||||
|
* @param outputPath output path for the processed relations
|
||||||
|
* @param relationFilter set of relation filters applied to the `relClass` field
|
||||||
|
* @param maxRelations maximum number of allowed outgoing edges
|
||||||
|
*/
|
||||||
|
// TODO work in progress
|
||||||
|
private static void prepareRelationsRDD(
|
||||||
|
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int relPartitions,
|
||||||
|
int maxRelations) {
|
||||||
|
JavaRDD<SortableRelation> rels = readPathRelationRDD(spark, inputRelationsPath).repartition(relPartitions);
|
||||||
|
RelationPartitioner partitioner = new RelationPartitioner(rels.getNumPartitions());
|
||||||
|
|
||||||
|
// only consider those that are not virtually deleted
|
||||||
|
RDD<SortableRelation> d = rels
|
||||||
|
.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
|
||||||
|
.filter(rel -> !relationFilter.contains(rel.getRelClass()))
|
||||||
|
.mapToPair(
|
||||||
|
(PairFunction<SortableRelation, SortableRelation, SortableRelation>) rel -> new Tuple2<>(rel, rel))
|
||||||
|
.groupByKey(partitioner)
|
||||||
|
.map(group -> Iterables.limit(group._2(), maxRelations))
|
||||||
|
.flatMap(group -> group.iterator())
|
||||||
|
.rdd();
|
||||||
|
|
||||||
|
spark
|
||||||
|
.createDataset(d, Encoders.bean(SortableRelation.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.parquet(outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
||||||
* file,
|
* file,
|
||||||
|
@ -123,31 +196,6 @@ public class PrepareRelationsJob {
|
||||||
Encoders.bean(SortableRelation.class));
|
Encoders.bean(SortableRelation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO work in progress
|
|
||||||
private static void prepareRelationsRDDFromPaths(
|
|
||||||
SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) {
|
|
||||||
JavaRDD<SortableRelation> rels = readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions);
|
|
||||||
|
|
||||||
RDD<SortableRelation> d = rels
|
|
||||||
.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only
|
|
||||||
// consider
|
|
||||||
// those
|
|
||||||
// that are not virtually
|
|
||||||
// deleted
|
|
||||||
.mapToPair(
|
|
||||||
(PairFunction<SortableRelation, SortableRelation, SortableRelation>) rel -> new Tuple2<>(rel, rel))
|
|
||||||
.groupByKey(new RelationPartitioner(rels.getNumPartitions()))
|
|
||||||
.map(p -> Iterables.limit(p._2(), MAX_RELS))
|
|
||||||
.flatMap(p -> p.iterator())
|
|
||||||
.rdd();
|
|
||||||
|
|
||||||
spark
|
|
||||||
.createDataset(d, Encoders.bean(SortableRelation.class))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.parquet(outputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static JavaRDD<SortableRelation> readPathRelationRDD(
|
private static JavaRDD<SortableRelation> readPathRelationRDD(
|
||||||
SparkSession spark, final String inputPath) {
|
SparkSession spark, final String inputPath) {
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
|
@ -16,10 +16,10 @@ public class SortableRelation extends Relation implements Comparable<Relation>,
|
||||||
static {
|
static {
|
||||||
weights.put("outcome", 0);
|
weights.put("outcome", 0);
|
||||||
weights.put("supplement", 1);
|
weights.put("supplement", 1);
|
||||||
weights.put("publicationDataset", 2);
|
weights.put("affiliation", 2);
|
||||||
weights.put("relationship", 3);
|
weights.put("relationship", 3);
|
||||||
weights.put("similarity", 4);
|
weights.put("publicationDataset", 4);
|
||||||
weights.put("affiliation", 5);
|
weights.put("similarity", 5);
|
||||||
|
|
||||||
weights.put("provision", 6);
|
weights.put("provision", 6);
|
||||||
weights.put("participation", 7);
|
weights.put("participation", 7);
|
||||||
|
|
|
@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
|
||||||
import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;
|
import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -95,7 +96,7 @@ public class TemplateFactory {
|
||||||
.add("metadata", instancemetadata)
|
.add("metadata", instancemetadata)
|
||||||
.add(
|
.add(
|
||||||
"webresources",
|
"webresources",
|
||||||
webresources
|
(webresources != null ? webresources : new ArrayList<String>())
|
||||||
.stream()
|
.stream()
|
||||||
.filter(StringUtils::isNotBlank)
|
.filter(StringUtils::isNotBlank)
|
||||||
.map(w -> getWebResource(w))
|
.map(w -> getWebResource(w))
|
||||||
|
|
|
@ -174,6 +174,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
entity
|
entity
|
||||||
.getCollectedfrom()
|
.getCollectedfrom()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(XmlRecordFactory::kvNotBlank)
|
||||||
.map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv))
|
.map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
entity
|
entity
|
||||||
.getOriginalId()
|
.getOriginalId()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -192,6 +194,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
entity
|
entity
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
|
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -213,6 +216,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getTitle()
|
.getTitle()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(t -> XmlSerializationUtils.mapStructuredProperty("title", t))
|
.map(t -> XmlSerializationUtils.mapStructuredProperty("title", t))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -225,6 +229,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getAuthor()
|
.getAuthor()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(
|
.map(
|
||||||
a -> {
|
a -> {
|
||||||
final StringBuilder sb = new StringBuilder("<creator rank=\"" + a.getRank() + "\"");
|
final StringBuilder sb = new StringBuilder("<creator rank=\"" + a.getRank() + "\"");
|
||||||
|
@ -240,24 +245,26 @@ public class XmlRecordFactory implements Serializable {
|
||||||
a
|
a
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.filter(
|
.filter(
|
||||||
sp -> isNotBlank(sp.getQualifier().getClassid())
|
sp -> isNotBlank(sp.getQualifier().getClassid())
|
||||||
&& isNotBlank(sp.getValue()))
|
&& isNotBlank(sp.getValue()))
|
||||||
|
.collect(
|
||||||
|
Collectors
|
||||||
|
.toMap(
|
||||||
|
p -> getAuthorPidType(p.getQualifier().getClassid()),
|
||||||
|
p -> p,
|
||||||
|
(p1, p2) -> p1))
|
||||||
|
.values()
|
||||||
.forEach(
|
.forEach(
|
||||||
sp -> {
|
sp -> {
|
||||||
String pidType = XmlSerializationUtils
|
String pidType = getAuthorPidType(sp.getQualifier().getClassid());
|
||||||
.escapeXml(
|
|
||||||
sp.getQualifier().getClassid())
|
|
||||||
.replaceAll("\\W", "");
|
|
||||||
String pidValue = XmlSerializationUtils.escapeXml(sp.getValue());
|
String pidValue = XmlSerializationUtils.escapeXml(sp.getValue());
|
||||||
|
|
||||||
// ugly hack: some records
|
// ugly hack: some records provide swapped pidtype and pidvalue
|
||||||
// provide swapped pidtype and
|
|
||||||
// pidvalue
|
|
||||||
if (authorPidTypes.contains(pidValue.toLowerCase().trim())) {
|
if (authorPidTypes.contains(pidValue.toLowerCase().trim())) {
|
||||||
sb.append(String.format(" %s=\"%s\"", pidValue, pidType));
|
sb.append(String.format(" %s=\"%s\"", pidValue, pidType));
|
||||||
} else {
|
} else {
|
||||||
pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", "");
|
|
||||||
if (isNotBlank(pidType)) {
|
if (isNotBlank(pidType)) {
|
||||||
sb
|
sb
|
||||||
.append(
|
.append(
|
||||||
|
@ -285,6 +292,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getContributor()
|
.getContributor()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -294,6 +302,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getCountry()
|
.getCountry()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.mapQualifier("country", c))
|
.map(c -> XmlSerializationUtils.mapQualifier("country", c))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -303,6 +312,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getCoverage()
|
.getCoverage()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -319,6 +329,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getDescription()
|
.getDescription()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -333,6 +344,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getSubject()
|
.getSubject()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s))
|
.map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -345,6 +357,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getRelevantdate()
|
.getRelevantdate()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s))
|
.map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -357,6 +370,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getSource()
|
.getSource()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -366,6 +380,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
r
|
r
|
||||||
.getFormat()
|
.getFormat()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -429,6 +444,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
orp
|
orp
|
||||||
.getContactperson()
|
.getContactperson()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -439,6 +455,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
orp
|
orp
|
||||||
.getContactgroup()
|
.getContactgroup()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -448,6 +465,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
orp
|
orp
|
||||||
.getTool()
|
.getTool()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -461,6 +479,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
s
|
s
|
||||||
.getDocumentationUrl()
|
.getDocumentationUrl()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -470,6 +489,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
s
|
s
|
||||||
.getLicense()
|
.getLicense()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(l -> XmlSerializationUtils.mapStructuredProperty("license", l))
|
.map(l -> XmlSerializationUtils.mapStructuredProperty("license", l))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -576,6 +596,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
ds
|
ds
|
||||||
.getOdlanguages()
|
.getOdlanguages()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -585,6 +606,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
ds
|
ds
|
||||||
.getOdcontenttypes()
|
.getOdcontenttypes()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -697,6 +719,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
ds
|
ds
|
||||||
.getPolicies()
|
.getPolicies()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(XmlRecordFactory::kvNotBlank)
|
||||||
.map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv))
|
.map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -709,6 +732,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
ds
|
ds
|
||||||
.getSubjects()
|
.getSubjects()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp))
|
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -735,6 +759,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
o
|
o
|
||||||
.getAlternativeNames()
|
.getAlternativeNames()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue()))
|
.map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -744,7 +769,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
|
XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
|
||||||
}
|
}
|
||||||
if (o.getLogourl() != null) {
|
if (o.getLogourl() != null) {
|
||||||
metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue()));
|
metadata.add(XmlSerializationUtils.asXmlElement("logourl", o.getLogourl().getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (o.getEclegalbody() != null) {
|
if (o.getEclegalbody() != null) {
|
||||||
|
@ -776,13 +801,13 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.asXmlElement(
|
.asXmlElement(
|
||||||
"echighereducation", o.getEchighereducation().getValue()));
|
"echighereducation", o.getEchighereducation().getValue()));
|
||||||
}
|
}
|
||||||
if (o.getEcinternationalorganization() != null) {
|
if (o.getEcinternationalorganizationeurinterests() != null) {
|
||||||
metadata
|
metadata
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils
|
XmlSerializationUtils
|
||||||
.asXmlElement(
|
.asXmlElement(
|
||||||
"ecinternationalorganizationeurinterests",
|
"ecinternationalorganizationeurinterests",
|
||||||
o.getEcinternationalorganization().getValue()));
|
o.getEcinternationalorganizationeurinterests().getValue()));
|
||||||
}
|
}
|
||||||
if (o.getEcinternationalorganization() != null) {
|
if (o.getEcinternationalorganization() != null) {
|
||||||
metadata
|
metadata
|
||||||
|
@ -862,6 +887,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
p
|
p
|
||||||
.getSubjects()
|
.getSubjects()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp))
|
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -912,7 +938,12 @@ public class XmlRecordFactory implements Serializable {
|
||||||
if (p.getFundingtree() != null) {
|
if (p.getFundingtree() != null) {
|
||||||
metadata
|
metadata
|
||||||
.addAll(
|
.addAll(
|
||||||
p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList()));
|
p
|
||||||
|
.getFundingtree()
|
||||||
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.map(ft -> ft.getValue())
|
||||||
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -923,6 +954,17 @@ public class XmlRecordFactory implements Serializable {
|
||||||
return metadata;
|
return metadata;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getAuthorPidType(String s) {
|
||||||
|
return XmlSerializationUtils
|
||||||
|
.escapeXml(s)
|
||||||
|
.replaceAll("\\W", "")
|
||||||
|
.replaceAll("\\d", "");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean kvNotBlank(KeyValue kv) {
|
||||||
|
return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
private void mapDatasourceType(List<String> metadata, final Qualifier dsType) {
|
private void mapDatasourceType(List<String> metadata, final Qualifier dsType) {
|
||||||
metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType));
|
metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType));
|
||||||
|
|
||||||
|
@ -960,7 +1002,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
|
XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
|
||||||
}
|
}
|
||||||
if (re.getResulttype() != null & re.getResulttype().isBlank()) {
|
if (re.getResulttype() != null && re.getResulttype().isBlank()) {
|
||||||
metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
|
metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
|
||||||
}
|
}
|
||||||
if (re.getCollectedfrom() != null) {
|
if (re.getCollectedfrom() != null) {
|
||||||
|
@ -969,6 +1011,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
re
|
re
|
||||||
.getCollectedfrom()
|
.getCollectedfrom()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(XmlRecordFactory::kvNotBlank)
|
||||||
.map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv))
|
.map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -986,10 +1029,10 @@ public class XmlRecordFactory implements Serializable {
|
||||||
if (isNotBlank(re.getOfficialname())) {
|
if (isNotBlank(re.getOfficialname())) {
|
||||||
metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname()));
|
metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname()));
|
||||||
}
|
}
|
||||||
if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) {
|
if (re.getDatasourcetype() != null && !re.getDatasourcetype().isBlank()) {
|
||||||
mapDatasourceType(metadata, re.getDatasourcetype());
|
mapDatasourceType(metadata, re.getDatasourcetype());
|
||||||
}
|
}
|
||||||
if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) {
|
if (re.getOpenairecompatibility() != null && !re.getOpenairecompatibility().isBlank()) {
|
||||||
metadata
|
metadata
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils
|
XmlSerializationUtils
|
||||||
|
@ -1006,7 +1049,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname()));
|
XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname()));
|
||||||
}
|
}
|
||||||
if (re.getCountry() != null & !re.getCountry().isBlank()) {
|
if (re.getCountry() != null && !re.getCountry().isBlank()) {
|
||||||
metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry()));
|
metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry()));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -1020,10 +1063,10 @@ public class XmlRecordFactory implements Serializable {
|
||||||
if (isNotBlank(re.getAcronym())) {
|
if (isNotBlank(re.getAcronym())) {
|
||||||
metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym()));
|
metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym()));
|
||||||
}
|
}
|
||||||
if (re.getContracttype() != null & !re.getContracttype().isBlank()) {
|
if (re.getContracttype() != null && !re.getContracttype().isBlank()) {
|
||||||
metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype()));
|
metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype()));
|
||||||
}
|
}
|
||||||
if (re.getFundingtree() != null & contexts != null) {
|
if (re.getFundingtree() != null && contexts != null) {
|
||||||
metadata
|
metadata
|
||||||
.addAll(
|
.addAll(
|
||||||
re
|
re
|
||||||
|
@ -1091,12 +1134,12 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright()));
|
XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright()));
|
||||||
}
|
}
|
||||||
if (instance.getCollectedfrom() != null) {
|
if (instance.getCollectedfrom() != null && kvNotBlank(instance.getCollectedfrom())) {
|
||||||
fields
|
fields
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom()));
|
XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom()));
|
||||||
}
|
}
|
||||||
if (instance.getHostedby() != null) {
|
if (instance.getHostedby() != null && kvNotBlank(instance.getHostedby())) {
|
||||||
fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby()));
|
fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby()));
|
||||||
}
|
}
|
||||||
if (instance.getDateofacceptance() != null
|
if (instance.getDateofacceptance() != null
|
||||||
|
|
|
@ -21,6 +21,18 @@
|
||||||
"paramName": "rp",
|
"paramName": "rp",
|
||||||
"paramLongName": "relPartitions",
|
"paramLongName": "relPartitions",
|
||||||
"paramDescription": "number or partitions for the relations Dataset",
|
"paramDescription": "number or partitions for the relations Dataset",
|
||||||
"paramRequired": true
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rf",
|
||||||
|
"paramLongName": "relationFilter",
|
||||||
|
"paramDescription": "filter applied reading relations (by relClass)",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mr",
|
||||||
|
"paramLongName": "maxRelations",
|
||||||
|
"paramDescription": "maximum number of relations allowed for a each entity",
|
||||||
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -9,6 +9,30 @@
|
||||||
<name>isLookupUrl</name>
|
<name>isLookupUrl</name>
|
||||||
<description>URL for the isLookup service</description>
|
<description>URL for the isLookup service</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>relPartitions</name>
|
||||||
|
<description>number or partitions for the relations Dataset</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>relationFilter</name>
|
||||||
|
<description>filter applied reading relations (by relClass)</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>maxRelations</name>
|
||||||
|
<description>maximum number of relations allowed for a each entity</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>otherDsTypeId</name>
|
||||||
|
<description>mapping used to populate datasourceTypeUi field</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>format</name>
|
||||||
|
<description>metadata format name (DMF|TMF)</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>batchSize</name>
|
||||||
|
<description>number of records to be included in each indexing request</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemoryForJoining</name>
|
<name>sparkDriverMemoryForJoining</name>
|
||||||
|
@ -56,6 +80,10 @@
|
||||||
<name>spark2EventLogDir</name>
|
<name>spark2EventLogDir</name>
|
||||||
<description>spark 2.* event log dir location</description>
|
<description>spark 2.* event log dir location</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkNetworkTimeout</name>
|
||||||
|
<description>configures spark.network.timeout</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<global>
|
<global>
|
||||||
|
@ -69,12 +97,16 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="reuse_records"/>
|
<start to="resume_from"/>
|
||||||
|
|
||||||
<decision name="reuse_records">
|
<decision name="resume_from">
|
||||||
<switch>
|
<switch>
|
||||||
<case to="prepare_relations">${wf:conf('reuseRecords') eq false}</case>
|
<case to="prepare_relations">${wf:conf('resumeFrom') eq 'prepare_relations'}</case>
|
||||||
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
|
<case to="fork_join_related_entities">${wf:conf('resumeFrom') eq 'fork_join_related_entities'}</case>
|
||||||
|
<case to="join_all_entities">${wf:conf('resumeFrom') eq 'join_all_entities'}</case>
|
||||||
|
<case to="adjancency_lists">${wf:conf('resumeFrom') eq 'adjancency_lists'}</case>
|
||||||
|
<case to="convert_to_xml">${wf:conf('resumeFrom') eq 'convert_to_xml'}</case>
|
||||||
|
<case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
|
||||||
<default to="prepare_relations"/>
|
<default to="prepare_relations"/>
|
||||||
</switch>
|
</switch>
|
||||||
</decision>
|
</decision>
|
||||||
|
@ -309,7 +341,6 @@
|
||||||
|
|
||||||
<join name="wait_joins" to="join_all_entities"/>
|
<join name="wait_joins" to="join_all_entities"/>
|
||||||
|
|
||||||
|
|
||||||
<action name="join_all_entities">
|
<action name="join_all_entities">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -419,4 +450,5 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<artifactId>dhp-stats-update</artifactId>
|
<artifactId>dhp-stats-update</artifactId>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
<relativePath>../</relativePath>
|
<relativePath>../</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|
2
pom.xml
2
pom.xml
|
@ -3,7 +3,7 @@
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
<licenses>
|
<licenses>
|
||||||
|
|
Loading…
Reference in New Issue