forked from D-Net/dnet-hadoop
Merge pull request 'master' (#13) from miriam.baglioni/dnet-hadoop:master into enrichment_wfs
This commit is contained in:
commit
e87eca9300
|
@ -6,7 +6,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-build</artifactId>
|
<artifactId>dhp-build</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-build</artifactId>
|
<artifactId>dhp-build</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-code-style</artifactId>
|
<artifactId>dhp-code-style</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
|
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-build</artifactId>
|
<artifactId>dhp-build</artifactId>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
<relativePath>../</relativePath>
|
<relativePath>../</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
<relativePath>../</relativePath>
|
<relativePath>../</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
|
||||||
|
public class LicenseComparator implements Comparator<Qualifier> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(Qualifier left, Qualifier right) {
|
||||||
|
|
||||||
|
if (left == null && right == null)
|
||||||
|
return 0;
|
||||||
|
if (left == null)
|
||||||
|
return 1;
|
||||||
|
if (right == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
String lClass = left.getClassid();
|
||||||
|
String rClass = right.getClassid();
|
||||||
|
|
||||||
|
if (lClass.equals(rClass))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (lClass.equals("OPEN SOURCE"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("OPEN SOURCE"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("OPEN"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("OPEN"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("6MONTHS"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("6MONTHS"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("12MONTHS"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("12MONTHS"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("EMBARGO"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("EMBARGO"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("RESTRICTED"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("RESTRICTED"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("CLOSED"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("CLOSED"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals("UNKNOWN"))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals("UNKNOWN"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
// Else (but unlikely), lexicographical ordering will do.
|
||||||
|
return lClass.compareTo(rClass);
|
||||||
|
}
|
||||||
|
}
|
|
@ -13,6 +13,7 @@ public class ModelConstants {
|
||||||
public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date";
|
public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date";
|
||||||
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
||||||
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
||||||
|
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
|
||||||
|
|
||||||
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
||||||
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
||||||
|
@ -49,6 +50,13 @@ public class ModelConstants {
|
||||||
public static final String HAS_PARTICIPANT = "hasParticipant";
|
public static final String HAS_PARTICIPANT = "hasParticipant";
|
||||||
public static final String IS_PARTICIPANT = "isParticipant";
|
public static final String IS_PARTICIPANT = "isParticipant";
|
||||||
|
|
||||||
|
public static final String RESULT_ORGANIZATION = "resultOrganization";
|
||||||
|
public static final String AFFILIATION = "affiliation";
|
||||||
|
public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf";
|
||||||
|
public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution";
|
||||||
|
|
||||||
|
public static final String MERGES = "merges";
|
||||||
|
|
||||||
public static final String UNKNOWN = "UNKNOWN";
|
public static final String UNKNOWN = "UNKNOWN";
|
||||||
public static final String NOT_AVAILABLE = "not available";
|
public static final String NOT_AVAILABLE = "not available";
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,15 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.common;
|
package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
|
import static com.google.common.base.Preconditions.checkArgument;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
@ -379,6 +384,21 @@ public class ModelSupport {
|
||||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> String tableIdentifier(String dbName, String tableName) {
|
||||||
|
|
||||||
|
checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty");
|
||||||
|
checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty");
|
||||||
|
|
||||||
|
return String.format("%s.%s", dbName, tableName);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> String tableIdentifier(String dbName, Class<T> clazz) {
|
||||||
|
|
||||||
|
checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null");
|
||||||
|
|
||||||
|
return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> Function<T, String> idFn() {
|
public static <T extends Oaf> Function<T, String> idFn() {
|
||||||
return x -> {
|
return x -> {
|
||||||
if (isSubClass(x, Relation.class)) {
|
if (isSubClass(x, Relation.class)) {
|
||||||
|
|
|
@ -8,7 +8,7 @@ public class DataInfo implements Serializable {
|
||||||
|
|
||||||
private Boolean invisible = false;
|
private Boolean invisible = false;
|
||||||
private Boolean inferred;
|
private Boolean inferred;
|
||||||
private Boolean deletedbyinference;
|
private Boolean deletedbyinference = false;
|
||||||
private String trust;
|
private String trust;
|
||||||
private String inferenceprovenance;
|
private String inferenceprovenance;
|
||||||
private Qualifier provenanceaction;
|
private Qualifier provenanceaction;
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public class Field<T> implements Serializable {
|
public class Field<T> implements Serializable {
|
||||||
|
|
||||||
|
@ -39,6 +40,6 @@ public class Field<T> implements Serializable {
|
||||||
if (getClass() != obj.getClass())
|
if (getClass() != obj.getClass())
|
||||||
return false;
|
return false;
|
||||||
Field<T> other = (Field<T>) obj;
|
Field<T> other = (Field<T>) obj;
|
||||||
return getValue().equals(other.getValue());
|
return Objects.equals(getValue(), other.getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,6 +106,7 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
||||||
.stream(lists)
|
.stream(lists)
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.flatMap(List::stream)
|
.flatMap(List::stream)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.distinct()
|
.distinct()
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
|
@ -244,7 +244,25 @@ public class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
subject = mergeLists(subject, r.getSubject());
|
subject = mergeLists(subject, r.getSubject());
|
||||||
|
|
||||||
|
//merge title lists: main title with higher trust and distinct between the others
|
||||||
|
StructuredProperty baseMainTitle = null;
|
||||||
|
if(title != null) {
|
||||||
|
baseMainTitle = getMainTitle(title);
|
||||||
|
title.remove(baseMainTitle);
|
||||||
|
}
|
||||||
|
|
||||||
|
StructuredProperty newMainTitle = null;
|
||||||
|
if(r.getTitle() != null) {
|
||||||
|
newMainTitle = getMainTitle(r.getTitle());
|
||||||
|
r.getTitle().remove(newMainTitle);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (newMainTitle != null && compareTrust(this, r) < 0 )
|
||||||
|
baseMainTitle = newMainTitle;
|
||||||
|
|
||||||
title = mergeLists(title, r.getTitle());
|
title = mergeLists(title, r.getTitle());
|
||||||
|
if (title != null && baseMainTitle != null)
|
||||||
|
title.add(baseMainTitle);
|
||||||
|
|
||||||
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
||||||
|
|
||||||
|
@ -294,4 +312,14 @@ public class Result extends OafEntity implements Serializable {
|
||||||
}
|
}
|
||||||
return a.size() > b.size() ? a : b;
|
return a.size() > b.size() ? a : b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private StructuredProperty getMainTitle(List<StructuredProperty> titles) {
|
||||||
|
//need to check if the list of titles contains more than 1 main title? (in that case, we should chose which main title select in the list)
|
||||||
|
for (StructuredProperty title: titles) {
|
||||||
|
if (title.getQualifier() != null && title.getQualifier().getClassid() != null)
|
||||||
|
if (title.getQualifier().getClassid().equals("main title"))
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-actionmanager</artifactId>
|
<artifactId>dhp-actionmanager</artifactId>
|
||||||
|
|
||||||
|
|
|
@ -523,7 +523,9 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Context mapContext(ResultProtos.Result.Context context) {
|
private static Context mapContext(ResultProtos.Result.Context context) {
|
||||||
|
if (context == null || StringUtils.isBlank(context.getId())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
final Context entity = new Context();
|
final Context entity = new Context();
|
||||||
entity.setId(context.getId());
|
entity.setId(context.getId());
|
||||||
entity
|
entity
|
||||||
|
@ -537,6 +539,10 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
||||||
|
if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final KeyValue keyValue = new KeyValue();
|
final KeyValue keyValue = new KeyValue();
|
||||||
keyValue.setKey(kv.getKey());
|
keyValue.setKey(kv.getKey());
|
||||||
keyValue.setValue(kv.getValue());
|
keyValue.setValue(kv.getValue());
|
||||||
|
@ -575,6 +581,10 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
||||||
|
if (sp == null | StringUtils.isBlank(sp.getValue())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final StructuredProperty structuredProperty = new StructuredProperty();
|
final StructuredProperty structuredProperty = new StructuredProperty();
|
||||||
structuredProperty.setValue(sp.getValue());
|
structuredProperty.setValue(sp.getValue());
|
||||||
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
||||||
|
@ -611,6 +621,10 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
||||||
|
if (s == null || StringUtils.isBlank(s.getValue())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final Field<String> stringField = new Field<>();
|
final Field<String> stringField = new Field<>();
|
||||||
stringField.setValue(s.getValue());
|
stringField.setValue(s.getValue());
|
||||||
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
||||||
|
@ -618,19 +632,16 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
||||||
|
if (b == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final Field<Boolean> booleanField = new Field<>();
|
final Field<Boolean> booleanField = new Field<>();
|
||||||
booleanField.setValue(b.getValue());
|
booleanField.setValue(b.getValue());
|
||||||
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||||
return booleanField;
|
return booleanField;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
|
|
||||||
final Field<Integer> entity = new Field<>();
|
|
||||||
entity.setValue(b.getValue());
|
|
||||||
entity.setDataInfo(mapDataInfo(b.getDataInfo()));
|
|
||||||
return entity;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
||||||
final Journal journal = new Journal();
|
final Journal journal = new Journal();
|
||||||
journal.setConferencedate(j.getConferencedate());
|
journal.setConferencedate(j.getConferencedate());
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-aggregation</artifactId>
|
<artifactId>dhp-aggregation</artifactId>
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class PrepareMergedRelationJob {
|
public class PrepareMergedRelationJob {
|
||||||
|
@ -56,6 +57,7 @@ public class PrepareMergedRelationJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
selectMergesRelations(
|
selectMergesRelations(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -84,4 +86,9 @@ public class PrepareMergedRelationJob {
|
||||||
(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
|
(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -62,6 +63,7 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
removeBlacklistedRelations(
|
removeBlacklistedRelations(
|
||||||
spark,
|
spark,
|
||||||
blacklistPath,
|
blacklistPath,
|
||||||
|
@ -69,7 +71,6 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
outputPath,
|
outputPath,
|
||||||
mergesPath);
|
mergesPath);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
||||||
|
@ -78,8 +79,6 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
||||||
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
||||||
|
|
||||||
log.info("InputRelationCount: {}", inputRelation.count());
|
|
||||||
|
|
||||||
Dataset<Relation> dedupSource = blackListed
|
Dataset<Relation> dedupSource = blackListed
|
||||||
.joinWith(
|
.joinWith(
|
||||||
mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
|
mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
|
||||||
|
@ -102,11 +101,6 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
return c._1();
|
return c._1();
|
||||||
}, Encoders.bean(Relation.class));
|
}, Encoders.bean(Relation.class));
|
||||||
|
|
||||||
dedupBL
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.json(blacklistPath + "/deduped");
|
|
||||||
|
|
||||||
inputRelation
|
inputRelation
|
||||||
.joinWith(
|
.joinWith(
|
||||||
dedupBL, (inputRelation
|
dedupBL, (inputRelation
|
||||||
|
@ -144,4 +138,8 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="blacklisting" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="blacklist_relations" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>postgresURL</name>
|
<name>postgresURL</name>
|
||||||
|
@ -22,6 +22,25 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -49,8 +68,6 @@
|
||||||
|
|
||||||
<action name="copy_publication">
|
<action name="copy_publication">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -60,8 +77,6 @@
|
||||||
|
|
||||||
<action name="copy_dataset">
|
<action name="copy_dataset">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -71,8 +86,6 @@
|
||||||
|
|
||||||
<action name="copy_orp">
|
<action name="copy_orp">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -82,8 +95,6 @@
|
||||||
|
|
||||||
<action name="copy_software">
|
<action name="copy_software">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
<arg>${nameNode}/${outputPath}/software</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -93,8 +104,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -102,10 +111,8 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_project">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -113,10 +120,8 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasource">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -128,8 +133,6 @@
|
||||||
|
|
||||||
<action name="read_blacklist">
|
<action name="read_blacklist">
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
|
<main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
|
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
|
@ -156,6 +159,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
|
||||||
|
@ -180,6 +184,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
|
|
|
@ -61,12 +61,6 @@ public class BlackListTest {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String outputPath =
|
|
||||||
* parser.get("outputPath"); log.info("outputPath {}: ", outputPath); final String blacklistPath =
|
|
||||||
* parser.get("hdfsPath"); log.info("blacklistPath {}: ", blacklistPath); final String mergesPath =
|
|
||||||
* parser.get("mergesPath"); log.info("mergesPath {}: ", mergesPath);
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void noRemoveTest() throws Exception {
|
public void noRemoveTest() throws Exception {
|
||||||
SparkRemoveBlacklistedRelationJob
|
SparkRemoveBlacklistedRelationJob
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -57,7 +57,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||||
<version>[1.0.0,2.0.0)</version>
|
<version>[2.0.0,3.0.0)</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
|
@ -29,31 +29,32 @@ public class EventFactory {
|
||||||
"yyyy-MM-dd"
|
"yyyy-MM-dd"
|
||||||
};
|
};
|
||||||
|
|
||||||
public static Event newBrokerEvent(final Result source, final Result target, final UpdateInfo<?> updateInfo) {
|
public static Event newBrokerEvent(final UpdateInfo<?> updateInfo) {
|
||||||
|
|
||||||
final long now = new Date().getTime();
|
final long now = new Date().getTime();
|
||||||
|
|
||||||
final Event res = new Event();
|
final Event res = new Event();
|
||||||
|
|
||||||
final Map<String, Object> map = createMapFromResult(target, source, updateInfo);
|
final Map<String, Object> map = createMapFromResult(updateInfo);
|
||||||
|
|
||||||
final String payload = createPayload(target, updateInfo);
|
final String payload = createPayload(updateInfo);
|
||||||
|
|
||||||
final String eventId = calculateEventId(
|
final String eventId = calculateEventId(
|
||||||
updateInfo.getTopic(), target.getOriginalId().get(0), updateInfo.getHighlightValueAsString());
|
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
|
||||||
|
updateInfo.getHighlightValueAsString());
|
||||||
|
|
||||||
res.setEventId(eventId);
|
res.setEventId(eventId);
|
||||||
res.setProducerId(PRODUCER_ID);
|
res.setProducerId(PRODUCER_ID);
|
||||||
res.setPayload(payload);
|
res.setPayload(payload);
|
||||||
res.setMap(map);
|
res.setMap(map);
|
||||||
res.setTopic(updateInfo.getTopic());
|
res.setTopic(updateInfo.getTopicPath());
|
||||||
res.setCreationDate(now);
|
res.setCreationDate(now);
|
||||||
res.setExpiryDate(calculateExpiryDate(now));
|
res.setExpiryDate(calculateExpiryDate(now));
|
||||||
res.setInstantMessage(false);
|
res.setInstantMessage(false);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String createPayload(final Result result, final UpdateInfo<?> updateInfo) {
|
private static String createPayload(final UpdateInfo<?> updateInfo) {
|
||||||
final OpenAireEventPayload payload = new OpenAireEventPayload();
|
final OpenAireEventPayload payload = new OpenAireEventPayload();
|
||||||
// TODO
|
// TODO
|
||||||
|
|
||||||
|
@ -62,32 +63,34 @@ public class EventFactory {
|
||||||
return payload.toJSON();
|
return payload.toJSON();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, Object> createMapFromResult(final Result oaf, final Result source,
|
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
|
||||||
final UpdateInfo<?> updateInfo) {
|
|
||||||
final Map<String, Object> map = new HashMap<>();
|
final Map<String, Object> map = new HashMap<>();
|
||||||
|
|
||||||
final List<KeyValue> collectedFrom = oaf.getCollectedfrom();
|
final Result source = updateInfo.getSource();
|
||||||
|
final Result target = updateInfo.getTarget();
|
||||||
|
|
||||||
|
final List<KeyValue> collectedFrom = target.getCollectedfrom();
|
||||||
if (collectedFrom.size() == 1) {
|
if (collectedFrom.size() == 1) {
|
||||||
map.put("target_datasource_id", collectedFrom.get(0).getKey());
|
map.put("target_datasource_id", collectedFrom.get(0).getKey());
|
||||||
map.put("target_datasource_name", collectedFrom.get(0).getValue());
|
map.put("target_datasource_name", collectedFrom.get(0).getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<String> ids = oaf.getOriginalId();
|
final List<String> ids = target.getOriginalId();
|
||||||
if (ids.size() > 0) {
|
if (ids.size() > 0) {
|
||||||
map.put("target_publication_id", ids.get(0));
|
map.put("target_publication_id", ids.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<StructuredProperty> titles = oaf.getTitle();
|
final List<StructuredProperty> titles = target.getTitle();
|
||||||
if (titles.size() > 0) {
|
if (titles.size() > 0) {
|
||||||
map.put("target_publication_title", titles.get(0));
|
map.put("target_publication_title", titles.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
final long date = parseDateTolong(oaf.getDateofacceptance().getValue());
|
final long date = parseDateTolong(target.getDateofacceptance().getValue());
|
||||||
if (date > 0) {
|
if (date > 0) {
|
||||||
map.put("target_dateofacceptance", date);
|
map.put("target_dateofacceptance", date);
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<StructuredProperty> subjects = oaf.getSubject();
|
final List<StructuredProperty> subjects = target.getSubject();
|
||||||
if (subjects.size() > 0) {
|
if (subjects.size() > 0) {
|
||||||
map
|
map
|
||||||
.put(
|
.put(
|
||||||
|
@ -95,7 +98,7 @@ public class EventFactory {
|
||||||
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<Author> authors = oaf.getAuthor();
|
final List<Author> authors = target.getAuthor();
|
||||||
if (authors.size() > 0) {
|
if (authors.size() > 0) {
|
||||||
map
|
map
|
||||||
.put(
|
.put(
|
||||||
|
|
|
@ -0,0 +1,66 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.model;
|
||||||
|
|
||||||
|
public enum Topic {
|
||||||
|
|
||||||
|
// ENRICHMENT MISSING
|
||||||
|
ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT(
|
||||||
|
"ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE(
|
||||||
|
"ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID(
|
||||||
|
"ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE(
|
||||||
|
"ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC(
|
||||||
|
"ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV(
|
||||||
|
"ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL(
|
||||||
|
"ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC(
|
||||||
|
"ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM(
|
||||||
|
"ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK(
|
||||||
|
"ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID(
|
||||||
|
"ENRICH/MISSING/AUTHOR/ORCID"),
|
||||||
|
|
||||||
|
// ENRICHMENT MORE
|
||||||
|
ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT(
|
||||||
|
"ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT(
|
||||||
|
"ENRICH/MORE/PROJECT"), ENRICH_MORE_SOFTWARE("ENRICH/MORE/SOFTWARE"), ENRICH_MORE_SUBJECT_MESHEUROPMC(
|
||||||
|
"ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV(
|
||||||
|
"ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL(
|
||||||
|
"ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC(
|
||||||
|
"ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM(
|
||||||
|
"ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"),
|
||||||
|
|
||||||
|
// ADDITION
|
||||||
|
ADD_BY_PROJECT("ADD/BY_PROJECT"),
|
||||||
|
|
||||||
|
// OTHER RELS
|
||||||
|
ENRICH_MISSING_PUBLICATION_IS_RELATED_TO(
|
||||||
|
"ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"), ENRICH_MISSING_PUBLICATION_REFERENCES(
|
||||||
|
"ENRICH/MISSING/PUBLICATION/REFERENCES"), ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY(
|
||||||
|
"ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO(
|
||||||
|
"ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY(
|
||||||
|
"ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"),
|
||||||
|
|
||||||
|
ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"), ENRICH_MISSING_DATASET_REFERENCES(
|
||||||
|
"ENRICH/MISSING/DATASET/REFERENCES"), ENRICH_MISSING_DATASET_IS_REFERENCED_BY(
|
||||||
|
"ENRICH/MISSING/DATASET/IS_REFERENCED_BY"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO(
|
||||||
|
"ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY(
|
||||||
|
"ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY"),;
|
||||||
|
|
||||||
|
Topic(final String path) {
|
||||||
|
this.path = path;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String path;
|
||||||
|
|
||||||
|
public String getPath() {
|
||||||
|
return this.path;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Topic fromPath(final String path) {
|
||||||
|
for (final Topic t : Topic.values()) {
|
||||||
|
if (t.getPath().equals(path)) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -4,12 +4,23 @@ package eu.dnetlib.dhp.broker.oa;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Column;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -19,25 +30,75 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.broker.model.Event;
|
import eu.dnetlib.dhp.broker.model.Event;
|
||||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAbstract;
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAuthorOrcid;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingOpenAccess;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPid;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingProject;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPublicationDate;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingSubject;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject;
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSoftware;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSoftware;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
|
||||||
public class GenerateEventsApplication {
|
public class GenerateEventsApplication {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
// Simple Matchers
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMissingAbstract = new EnrichMissingAbstract();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMissingPid = new EnrichMissingPid();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMissingSubject = new EnrichMissingSubject();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMorePid = new EnrichMorePid();
|
||||||
|
private static final UpdateMatcher<Result, ?> enrichMoreSubject = new EnrichMoreSubject();
|
||||||
|
|
||||||
|
// Advanced matchers
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMissingProject = new EnrichMissingProject();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMoreProject = new EnrichMoreProject();
|
||||||
|
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMissingSoftware = new EnrichMissingSoftware();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
|
||||||
|
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
|
||||||
|
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
|
||||||
|
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
|
||||||
|
|
||||||
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -60,9 +121,19 @@ public class GenerateEventsApplication {
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
removeOutputDir(spark, eventsPath);
|
removeOutputDir(spark, eventsPath);
|
||||||
generateEvents(spark, graphPath, eventsPath);
|
|
||||||
|
final JavaRDD<Event> eventsRdd = sc.emptyRDD();
|
||||||
|
|
||||||
|
eventsRdd.union(generateSimpleEvents(spark, graphPath, Publication.class));
|
||||||
|
eventsRdd.union(generateSimpleEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class));
|
||||||
|
eventsRdd.union(generateSimpleEvents(spark, graphPath, Software.class));
|
||||||
|
eventsRdd.union(generateSimpleEvents(spark, graphPath, OtherResearchProduct.class));
|
||||||
|
|
||||||
|
eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -71,42 +142,139 @@ public class GenerateEventsApplication {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void generateEvents(final SparkSession spark, final String graphPath, final String eventsPath) {
|
private static <R extends Result> JavaRDD<Event> generateSimpleEvents(final SparkSession spark,
|
||||||
// TODO
|
final String graphPath,
|
||||||
|
final Class<R> resultClazz) {
|
||||||
|
|
||||||
|
final Dataset<R> results = readPath(
|
||||||
|
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
|
||||||
|
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||||
|
|
||||||
|
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||||
|
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||||
|
|
||||||
|
final Column c = null; // TODO
|
||||||
|
|
||||||
|
final Dataset<Row> aa = results
|
||||||
|
.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
|
||||||
|
.groupBy(rels.col("target"))
|
||||||
|
.agg(c)
|
||||||
|
.filter(x -> x.size() > 1)
|
||||||
|
// generateSimpleEvents(...)
|
||||||
|
// flatMap()
|
||||||
|
// toRdd()
|
||||||
|
;
|
||||||
|
|
||||||
|
return null;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Event> generateEvents(final Result... children) {
|
private List<Event> generateSimpleEvents(final Collection<Result> children) {
|
||||||
final List<Event> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
for (final Result source : children) {
|
for (final Result target : children) {
|
||||||
for (final Result target : children) {
|
list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children));
|
||||||
if (source != target) {
|
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
|
||||||
list
|
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
|
||||||
.addAll(
|
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
|
||||||
findUpdates(source, target)
|
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
|
||||||
.stream()
|
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
|
||||||
.map(info -> EventFactory.newBrokerEvent(source, target, info))
|
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
|
||||||
.collect(Collectors.toList()));
|
list.addAll(enrichMorePid.searchUpdatesForRecord(target, children));
|
||||||
}
|
list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children));
|
||||||
|
}
|
||||||
|
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects) {
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Pair<Result, List<Project>> target : childrenWithProjects) {
|
||||||
|
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||||
|
list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||||
|
}
|
||||||
|
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares) {
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
|
||||||
|
list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||||
|
list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||||
|
}
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Event> generatePublicationRelatedEvents(final String relType,
|
||||||
|
final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels) {
|
||||||
|
|
||||||
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
|
final List<Pair<Result, List<Publication>>> cleanedChildrens = childrenWithRels
|
||||||
|
.stream()
|
||||||
|
.filter(p -> p.getRight().containsKey(relType))
|
||||||
|
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||||
|
.filter(p -> p.getRight().size() > 0)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
|
||||||
|
if (relType.equals("isRelatedTo")) {
|
||||||
|
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("references")) {
|
||||||
|
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isReferencedBy")) {
|
||||||
|
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedTo")) {
|
||||||
|
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedBy")) {
|
||||||
|
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return list;
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<UpdateInfo<?>> findUpdates(final Result source, final Result target) {
|
private List<Event> generateDatasetRelatedEvents(final String relType,
|
||||||
|
final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels) {
|
||||||
|
|
||||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
list.addAll(EnrichMissingAbstract.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingAuthorOrcid.findUpdates(source, target));
|
final List<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>> cleanedChildrens = childrenWithRels
|
||||||
list.addAll(EnrichMissingOpenAccess.findUpdates(source, target));
|
.stream()
|
||||||
list.addAll(EnrichMissingPid.findUpdates(source, target));
|
.filter(p -> p.getRight().containsKey(relType))
|
||||||
list.addAll(EnrichMissingProject.findUpdates(source, target));
|
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||||
list.addAll(EnrichMissingPublicationDate.findUpdates(source, target));
|
.filter(p -> p.getRight().size() > 0)
|
||||||
list.addAll(EnrichMissingSubject.findUpdates(source, target));
|
.collect(Collectors.toList());
|
||||||
list.addAll(EnrichMoreOpenAccess.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMorePid.findUpdates(source, target));
|
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
|
||||||
list.addAll(EnrichMoreSubject.findUpdates(source, target));
|
if (relType.equals("isRelatedTo")) {
|
||||||
return list;
|
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("references")) {
|
||||||
|
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isReferencedBy")) {
|
||||||
|
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedTo")) {
|
||||||
|
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
} else if (relType.equals("isSupplementedBy")) {
|
||||||
|
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <R> Dataset<R> readPath(
|
||||||
|
final SparkSession spark,
|
||||||
|
final String inputPath,
|
||||||
|
final Class<R> clazz) {
|
||||||
|
return spark
|
||||||
|
.read()
|
||||||
|
.textFile(inputPath)
|
||||||
|
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
|
||||||
|
public abstract class UpdateMatcher<K, T> {
|
||||||
|
|
||||||
|
private final boolean multipleUpdate;
|
||||||
|
|
||||||
|
public UpdateMatcher(final boolean multipleUpdate) {
|
||||||
|
this.multipleUpdate = multipleUpdate;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others) {
|
||||||
|
|
||||||
|
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||||
|
|
||||||
|
for (final K source : others) {
|
||||||
|
if (source != res) {
|
||||||
|
for (final UpdateInfo<T> info : findUpdates(source, res)) {
|
||||||
|
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||||
|
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||||
|
} else {
|
||||||
|
infoMap.put(s, info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final Collection<UpdateInfo<T>> values = infoMap.values();
|
||||||
|
|
||||||
|
if (values.isEmpty() || multipleUpdate) {
|
||||||
|
return values;
|
||||||
|
} else {
|
||||||
|
final UpdateInfo<T> v = values
|
||||||
|
.stream()
|
||||||
|
.sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
|
||||||
|
.findFirst()
|
||||||
|
.get();
|
||||||
|
return Arrays.asList(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract List<UpdateInfo<T>> findUpdates(K source, K target);
|
||||||
|
|
||||||
|
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
|
||||||
|
final K source,
|
||||||
|
final K target);
|
||||||
|
|
||||||
|
protected static boolean isMissing(final List<Field<String>> list) {
|
||||||
|
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean isMissing(final Field<String> field) {
|
||||||
|
return field == null || StringUtils.isBlank(field.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public abstract class AbstractEnrichMissingDataset
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
|
||||||
|
|
||||||
|
private final Topic topic;
|
||||||
|
|
||||||
|
public AbstractEnrichMissingDataset(final Topic topic) {
|
||||||
|
super(true);
|
||||||
|
this.topic = topic;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
|
||||||
|
final Pair<Result, List<Dataset>> source,
|
||||||
|
final Pair<Result, List<Dataset>> target) {
|
||||||
|
|
||||||
|
final Set<String> existingDatasets = target
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.map(Dataset::getId)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.filter(d -> !existingDatasets.contains(d.getId()))
|
||||||
|
.map(ConversionUtils::oafDatasetToBrokerDataset)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Dataset highlightValue,
|
||||||
|
final Pair<Result, List<Dataset>> source,
|
||||||
|
final Pair<Result, List<Dataset>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
getTopic(),
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, rel) -> p.getDatasets().add(rel),
|
||||||
|
rel -> rel.getInstances().get(0).getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Topic getTopic() {
|
||||||
|
return topic;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsReferencedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsRelatedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsSupplementedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetIsSupplementedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset {
|
||||||
|
|
||||||
|
public EnrichMissingDatasetReferences() {
|
||||||
|
super(Topic.ENRICH_MISSING_DATASET_REFERENCES);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingProject
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||||
|
|
||||||
|
public EnrichMissingProject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||||
|
final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PROJECT,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
|
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||||
|
|
||||||
|
public EnrichMoreProject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||||
|
final Pair<Result, List<Project>> source,
|
||||||
|
final Pair<Result, List<Project>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_PROJECT,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
|
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public abstract class AbstractEnrichMissingPublication
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
|
||||||
|
|
||||||
|
private final Topic topic;
|
||||||
|
|
||||||
|
public AbstractEnrichMissingPublication(final Topic topic) {
|
||||||
|
super(true);
|
||||||
|
this.topic = topic;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
|
||||||
|
final Pair<Result, List<Publication>> source,
|
||||||
|
final Pair<Result, List<Publication>> target) {
|
||||||
|
|
||||||
|
final Set<String> existingPublications = target
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.map(Publication::getId)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getRight()
|
||||||
|
.stream()
|
||||||
|
.filter(d -> !existingPublications.contains(d.getId()))
|
||||||
|
.map(ConversionUtils::oafPublicationToBrokerPublication)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Publication highlightValue,
|
||||||
|
final Pair<Result, List<Publication>> source,
|
||||||
|
final Pair<Result, List<Publication>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
getTopic(),
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, rel) -> p.getPublications().add(rel),
|
||||||
|
rel -> rel.getInstances().get(0).getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Topic getTopic() {
|
||||||
|
return topic;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsReferencedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsRelatedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsSupplementedBy() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationIsSupplementedTo() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationReferences() {
|
||||||
|
super(Topic.ENRICH_MISSING_PUBLICATION_REFERENCES);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
|
||||||
|
|
||||||
|
public EnrichMissingAbstract() {
|
||||||
|
super(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||||
|
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
|
||||||
|
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target));
|
||||||
|
}
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_ABSTRACT,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, s) -> p.getAbstracts().add(s),
|
||||||
|
s -> s);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> {
|
||||||
|
|
||||||
|
public EnrichMissingAuthorOrcid() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||||
|
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()),
|
||||||
|
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,56 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||||
|
|
||||||
|
public EnrichMissingOpenAccess() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||||
|
final long count = target
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.map(i -> i.getAccessright().getClassid())
|
||||||
|
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.count();
|
||||||
|
|
||||||
|
if (count > 0) {
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||||
|
.flatMap(s -> s)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_OA_VERSION,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, i) -> p.getInstances().add(i),
|
||||||
|
Instance::getUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
|
||||||
|
|
||||||
|
public EnrichMissingPid() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||||
|
final long count = target.getPid().size();
|
||||||
|
|
||||||
|
if (count > 0) {
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(ConversionUtils::oafPidToBrokerPid)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PID,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pid) -> p.getPids().add(pid),
|
||||||
|
pid -> pid.getType() + "::" + pid.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationDate() {
|
||||||
|
super(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||||
|
if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) {
|
||||||
|
return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target));
|
||||||
|
}
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, date) -> p.setPublicationdate(date),
|
||||||
|
s -> s);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,42 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
|
||||||
|
public class EnrichMissingSoftware
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||||
|
|
||||||
|
public EnrichMissingSoftware() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_SOFTWARE,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, s) -> p.getSoftwares().add(s),
|
||||||
|
s -> s.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||||
|
|
||||||
|
public EnrichMissingSubject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> existingTypes = target
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getQualifier)
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
|
||||||
|
.map(ConversionUtils::oafSubjectToPair)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||||
|
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||||
|
|
||||||
|
public EnrichMoreOpenAccess() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> urls = target
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.map(i -> i.getUrl())
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||||
|
.flatMap(s -> s)
|
||||||
|
.filter(i -> !urls.contains(i.getUrl()))
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_OA_VERSION,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, i) -> p.getInstances().add(i),
|
||||||
|
Instance::getUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
|
||||||
|
|
||||||
|
public EnrichMorePid() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> existingPids = target
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||||
|
.map(ConversionUtils::oafPidToBrokerPid)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_PID,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pid) -> p.getPids().add(pid),
|
||||||
|
pid -> pid.getType() + "::" + pid.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,42 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
|
||||||
|
public class EnrichMoreSoftware
|
||||||
|
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||||
|
|
||||||
|
public EnrichMoreSoftware() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
// TODO
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||||
|
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||||
|
final Pair<Result, List<Software>> source,
|
||||||
|
final Pair<Result, List<Software>> target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_SOFTWARE,
|
||||||
|
highlightValue, source.getLeft(), target.getLeft(),
|
||||||
|
(p, s) -> p.getSoftwares().add(s),
|
||||||
|
s -> s.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||||
|
|
||||||
|
public EnrichMoreSubject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> existingSubjects = target
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||||
|
.map(ConversionUtils::oafSubjectToPair)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||||
|
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,9 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
public class BrokerConstants {
|
||||||
|
|
||||||
|
public final static String OPEN_ACCESS = "OPEN";
|
||||||
|
public final static String IS_MERGED_IN_CLASS = "isMergedIn";
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class ConversionUtils {
|
||||||
|
|
||||||
|
public static Stream<Instance> oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||||
|
return i.getUrl().stream().map(url -> {
|
||||||
|
final Instance r = new Instance();
|
||||||
|
r.setUrl(url);
|
||||||
|
r.setInstancetype(i.getInstancetype().getClassid());
|
||||||
|
r.setLicense(BrokerConstants.OPEN_ACCESS);
|
||||||
|
r.setHostedby(i.getHostedby().getValue());
|
||||||
|
return r;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Pid oafPidToBrokerPid(final StructuredProperty sp) {
|
||||||
|
final Pid pid = new Pid();
|
||||||
|
pid.setValue(sp.getValue());
|
||||||
|
pid.setType(sp.getQualifier().getClassid());
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
|
||||||
|
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||||
|
final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset();
|
||||||
|
// TODO
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication d) {
|
||||||
|
final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication();
|
||||||
|
// TODO
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,31 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingAbstract extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingAbstract> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingAbstract(final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/ABSTRACT", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getAbstracts().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,31 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingAuthorOrcid extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingAuthorOrcid> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingAuthorOrcid(final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/AUTHOR/ORCID", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
// TODO
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingOpenAccess extends UpdateInfo<Instance> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingOpenAccess> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingOpenAccess(final Instance highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/OPENACCESS_VERSION", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getInstances().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getUrl();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingPid extends UpdateInfo<Pid> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingPid> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingPid(final Pid highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/PID", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getPids().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getType() + "::" + getHighlightValue().getValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,33 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.broker.objects.Project;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingProject extends UpdateInfo<Project> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingProject> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingProject(final Project highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/PROJECT", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getProjects().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getFunder() + "::" + getHighlightValue().getFundingProgram()
|
|
||||||
+ getHighlightValue().getCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,31 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingPublicationDate extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingPublicationDate> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingPublicationDate(final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/PUBLICATION_DATE", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().setPublicationdate(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,36 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingSubject extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingSubject> findUpdates(final Result source, final Result target) {
|
|
||||||
// MESHEUROPMC
|
|
||||||
// ARXIV
|
|
||||||
// JEL
|
|
||||||
// DDC
|
|
||||||
// ACM
|
|
||||||
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingSubject(final String subjectClassification, final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/SUBJECT/" + subjectClassification, highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getSubjects().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMoreOpenAccess extends UpdateInfo<Instance> {
|
|
||||||
|
|
||||||
public static List<EnrichMoreOpenAccess> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMoreOpenAccess(final Instance highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MORE/OPENACCESS_VERSION", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getInstances().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getUrl();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMorePid extends UpdateInfo<Pid> {
|
|
||||||
|
|
||||||
public static List<EnrichMorePid> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMorePid(final Pid highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MORE/PID", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getPids().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getType() + "::" + getHighlightValue().getValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,36 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMoreSubject extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMoreSubject> findUpdates(final Result source, final Result target) {
|
|
||||||
// MESHEUROPMC
|
|
||||||
// ARXIV
|
|
||||||
// JEL
|
|
||||||
// DDC
|
|
||||||
// ACM
|
|
||||||
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMoreSubject(final String subjectClassification, final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MORE/SUBJECT/" + subjectClassification, highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getSubjects().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,36 +1,77 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
||||||
|
import eu.dnetlib.broker.objects.Publication;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public abstract class UpdateInfo<T> {
|
public final class UpdateInfo<T> {
|
||||||
|
|
||||||
private final String topic;
|
private final Topic topic;
|
||||||
|
|
||||||
private final T highlightValue;
|
private final T highlightValue;
|
||||||
|
|
||||||
|
private final Result source;
|
||||||
|
|
||||||
|
private final Result target;
|
||||||
|
|
||||||
|
private final BiConsumer<Publication, T> compileHighlight;
|
||||||
|
|
||||||
|
private final Function<T, String> highlightToString;
|
||||||
|
|
||||||
private final float trust;
|
private final float trust;
|
||||||
|
|
||||||
protected UpdateInfo(final String topic, final T highlightValue, final float trust) {
|
public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target,
|
||||||
|
final BiConsumer<Publication, T> compileHighlight,
|
||||||
|
final Function<T, String> highlightToString) {
|
||||||
this.topic = topic;
|
this.topic = topic;
|
||||||
this.highlightValue = highlightValue;
|
this.highlightValue = highlightValue;
|
||||||
this.trust = trust;
|
this.source = source;
|
||||||
|
this.target = target;
|
||||||
|
this.compileHighlight = compileHighlight;
|
||||||
|
this.highlightToString = highlightToString;
|
||||||
|
this.trust = calculateTrust(source, target);
|
||||||
}
|
}
|
||||||
|
|
||||||
public T getHighlightValue() {
|
public T getHighlightValue() {
|
||||||
return highlightValue;
|
return highlightValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Result getSource() {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Result getTarget() {
|
||||||
|
return target;
|
||||||
|
}
|
||||||
|
|
||||||
|
private float calculateTrust(final Result source, final Result target) {
|
||||||
|
// TODO
|
||||||
|
return 0.9f;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Topic getTopic() {
|
||||||
|
return topic;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTopicPath() {
|
||||||
|
return topic.getPath();
|
||||||
|
}
|
||||||
|
|
||||||
public float getTrust() {
|
public float getTrust() {
|
||||||
return trust;
|
return trust;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTopic() {
|
public void compileHighlight(final OpenAireEventPayload payload) {
|
||||||
return topic;
|
compileHighlight.accept(payload.getHighlight(), getHighlightValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
abstract public void compileHighlight(OpenAireEventPayload payload);
|
public String getHighlightValueAsString() {
|
||||||
|
return highlightToString.apply(getHighlightValue());
|
||||||
abstract public String getHighlightValueAsString();
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<artifactId>dhp-dedup-openaire</artifactId>
|
<artifactId>dhp-dedup-openaire</artifactId>
|
||||||
|
|
|
@ -0,0 +1,170 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class AuthorMerger {
|
||||||
|
|
||||||
|
private static final Double THRESHOLD = 0.95;
|
||||||
|
|
||||||
|
public static List<Author> merge(List<List<Author>> authors) {
|
||||||
|
|
||||||
|
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
||||||
|
|
||||||
|
List<Author> author = new ArrayList<>();
|
||||||
|
|
||||||
|
for (List<Author> a : authors) {
|
||||||
|
author = mergeAuthor(author, a);
|
||||||
|
}
|
||||||
|
|
||||||
|
return author;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||||
|
int pa = countAuthorsPids(a);
|
||||||
|
int pb = countAuthorsPids(b);
|
||||||
|
List<Author> base, enrich;
|
||||||
|
int sa = authorsSize(a);
|
||||||
|
int sb = authorsSize(b);
|
||||||
|
|
||||||
|
if (pa == pb) {
|
||||||
|
base = sa > sb ? a : b;
|
||||||
|
enrich = sa > sb ? b : a;
|
||||||
|
} else {
|
||||||
|
base = pa > pb ? a : b;
|
||||||
|
enrich = pa > pb ? b : a;
|
||||||
|
}
|
||||||
|
enrichPidFromList(base, enrich);
|
||||||
|
return base;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
||||||
|
if (base == null || enrich == null)
|
||||||
|
return;
|
||||||
|
final Map<String, Author> basePidAuthorMap = base
|
||||||
|
.stream()
|
||||||
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
|
.flatMap(
|
||||||
|
a -> a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
||||||
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||||
|
.stream()
|
||||||
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
|
.flatMap(
|
||||||
|
a -> a
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
||||||
|
.map(p -> new Tuple2<>(p, a)))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
pidToEnrich
|
||||||
|
.forEach(
|
||||||
|
a -> {
|
||||||
|
Optional<Tuple2<Double, Author>> simAuthor = base
|
||||||
|
.stream()
|
||||||
|
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
||||||
|
.max(Comparator.comparing(Tuple2::_1));
|
||||||
|
|
||||||
|
if (simAuthor.isPresent()) {
|
||||||
|
double th = THRESHOLD;
|
||||||
|
// increase the threshold if the surname is too short
|
||||||
|
if (simAuthor.get()._2().getSurname() != null
|
||||||
|
&& simAuthor.get()._2().getSurname().length() <= 3)
|
||||||
|
th = 0.99;
|
||||||
|
|
||||||
|
if (simAuthor.get()._1() > th) {
|
||||||
|
Author r = simAuthor.get()._2();
|
||||||
|
if (r.getPid() == null) {
|
||||||
|
r.setPid(new ArrayList<>());
|
||||||
|
}
|
||||||
|
r.getPid().add(a._1());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String pidToComparableString(StructuredProperty pid) {
|
||||||
|
return (pid.getQualifier() != null
|
||||||
|
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
|
||||||
|
: "")
|
||||||
|
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int countAuthorsPids(List<Author> authors) {
|
||||||
|
if (authors == null)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return (int) authors.stream().filter(AuthorMerger::hasPid).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int authorsSize(List<Author> authors) {
|
||||||
|
if (authors == null)
|
||||||
|
return 0;
|
||||||
|
return authors.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double sim(Author a, Author b) {
|
||||||
|
|
||||||
|
final Person pa = parse(a);
|
||||||
|
final Person pb = parse(b);
|
||||||
|
|
||||||
|
// if both are accurate (e.g. they have name and surname)
|
||||||
|
if (pa.isAccurate() & pb.isAccurate()) {
|
||||||
|
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
|
||||||
|
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
|
||||||
|
} else {
|
||||||
|
return new JaroWinkler()
|
||||||
|
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean hasPid(Author a) {
|
||||||
|
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
||||||
|
return false;
|
||||||
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Person parse(Author author) {
|
||||||
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||||
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
||||||
|
} else {
|
||||||
|
return new Person(author.getFullname(), false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String normalize(final String s) {
|
||||||
|
return nfd(s)
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||||
|
// in case
|
||||||
|
// of large input strings
|
||||||
|
.replaceAll("(\\W)+", " ")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,8 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
|
@ -67,16 +69,19 @@ public class DedupRecordFactory {
|
||||||
(MapFunction<Tuple2<String, T>, String>) entity -> entity._1(), Encoders.STRING())
|
(MapFunction<Tuple2<String, T>, String>) entity -> entity._1(), Encoders.STRING())
|
||||||
.mapGroups(
|
.mapGroups(
|
||||||
(MapGroupsFunction<String, Tuple2<String, T>, T>) (key,
|
(MapGroupsFunction<String, Tuple2<String, T>, T>) (key,
|
||||||
values) -> entityMerger(key, values, ts, dataInfo),
|
values) -> entityMerger(key, values, ts, dataInfo, clazz),
|
||||||
Encoders.bean(clazz));
|
Encoders.bean(clazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends OafEntity> T entityMerger(
|
public static <T extends OafEntity> T entityMerger(
|
||||||
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo) {
|
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
|
||||||
|
throws IllegalAccessException, InstantiationException {
|
||||||
|
|
||||||
T entity = entities.next()._2();
|
T entity = clazz.newInstance();
|
||||||
|
|
||||||
final Collection<String> dates = Lists.newArrayList();
|
final Collection<String> dates = Lists.newArrayList();
|
||||||
|
final List<List<Author>> authors = Lists.newArrayList();
|
||||||
|
|
||||||
entities
|
entities
|
||||||
.forEachRemaining(
|
.forEachRemaining(
|
||||||
t -> {
|
t -> {
|
||||||
|
@ -84,17 +89,17 @@ public class DedupRecordFactory {
|
||||||
entity.mergeFrom(duplicate);
|
entity.mergeFrom(duplicate);
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
Result r1 = (Result) duplicate;
|
Result r1 = (Result) duplicate;
|
||||||
Result er = (Result) entity;
|
if (r1.getAuthor() != null && r1.getAuthor().size() > 0)
|
||||||
er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor()));
|
authors.add(r1.getAuthor());
|
||||||
|
if (r1.getDateofacceptance() != null)
|
||||||
if (r1.getDateofacceptance() != null) {
|
|
||||||
dates.add(r1.getDateofacceptance().getValue());
|
dates.add(r1.getDateofacceptance().getValue());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// set authors and date
|
||||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||||
((Result) entity).setDateofacceptance(DatePicker.pick(dates));
|
((Result) entity).setDateofacceptance(DatePicker.pick(dates));
|
||||||
|
((Result) entity).setAuthor(AuthorMerger.merge(authors));
|
||||||
}
|
}
|
||||||
|
|
||||||
entity.setId(id);
|
entity.setId(id);
|
||||||
|
|
|
@ -32,7 +32,6 @@ import eu.dnetlib.pace.model.Person;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class DedupUtility {
|
public class DedupUtility {
|
||||||
private static final Double THRESHOLD = 0.95;
|
|
||||||
|
|
||||||
public static Map<String, LongAccumulator> constructAccumulator(
|
public static Map<String, LongAccumulator> constructAccumulator(
|
||||||
final DedupConfig dedupConf, final SparkContext context) {
|
final DedupConfig dedupConf, final SparkContext context) {
|
||||||
|
@ -82,58 +81,6 @@ public class DedupUtility {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
|
||||||
int pa = countAuthorsPids(a);
|
|
||||||
int pb = countAuthorsPids(b);
|
|
||||||
List<Author> base, enrich;
|
|
||||||
int sa = authorsSize(a);
|
|
||||||
int sb = authorsSize(b);
|
|
||||||
|
|
||||||
if (pa == pb) {
|
|
||||||
base = sa > sb ? a : b;
|
|
||||||
enrich = sa > sb ? b : a;
|
|
||||||
} else {
|
|
||||||
base = pa > pb ? a : b;
|
|
||||||
enrich = pa > pb ? b : a;
|
|
||||||
}
|
|
||||||
enrichPidFromList(base, enrich);
|
|
||||||
return base;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
|
||||||
if (base == null || enrich == null)
|
|
||||||
return;
|
|
||||||
final Map<String, Author> basePidAuthorMap = base
|
|
||||||
.stream()
|
|
||||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
|
||||||
.flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a)))
|
|
||||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
|
||||||
|
|
||||||
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
|
||||||
.stream()
|
|
||||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
|
||||||
.flatMap(
|
|
||||||
a -> a
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.filter(p -> !basePidAuthorMap.containsKey(p.toComparableString()))
|
|
||||||
.map(p -> new Tuple2<>(p, a)))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
pidToEnrich
|
|
||||||
.forEach(
|
|
||||||
a -> {
|
|
||||||
Optional<Tuple2<Double, Author>> simAuhtor = base
|
|
||||||
.stream()
|
|
||||||
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
|
||||||
.max(Comparator.comparing(Tuple2::_1));
|
|
||||||
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
|
|
||||||
Author r = simAuhtor.get()._2();
|
|
||||||
r.getPid().add(a._1());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String createDedupRecordPath(
|
public static String createDedupRecordPath(
|
||||||
final String basePath, final String actionSetId, final String entityType) {
|
final String basePath, final String actionSetId, final String entityType) {
|
||||||
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
|
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
|
||||||
|
@ -153,65 +100,6 @@ public class DedupUtility {
|
||||||
return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
|
return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Double sim(Author a, Author b) {
|
|
||||||
|
|
||||||
final Person pa = parse(a);
|
|
||||||
final Person pb = parse(b);
|
|
||||||
|
|
||||||
if (pa.isAccurate() & pb.isAccurate()) {
|
|
||||||
return new JaroWinkler()
|
|
||||||
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
|
|
||||||
} else {
|
|
||||||
return new JaroWinkler()
|
|
||||||
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String normalize(final String s) {
|
|
||||||
return nfd(s)
|
|
||||||
.toLowerCase()
|
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
||||||
// in case
|
|
||||||
// of large input strings
|
|
||||||
.replaceAll("(\\W)+", " ")
|
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
|
||||||
.replaceAll("(\\d)+", " ")
|
|
||||||
.replaceAll("(\\n)+", " ")
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String nfd(final String s) {
|
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Person parse(Author author) {
|
|
||||||
if (StringUtils.isNotBlank(author.getSurname())) {
|
|
||||||
return new Person(author.getSurname() + ", " + author.getName(), false);
|
|
||||||
} else {
|
|
||||||
return new Person(author.getFullname(), false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int countAuthorsPids(List<Author> authors) {
|
|
||||||
if (authors == null)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return (int) authors.stream().filter(DedupUtility::hasPid).count();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int authorsSize(List<Author> authors) {
|
|
||||||
if (authors == null)
|
|
||||||
return 0;
|
|
||||||
return authors.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean hasPid(Author a) {
|
|
||||||
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
|
||||||
return false;
|
|
||||||
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator)
|
public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator)
|
||||||
throws ISLookUpException, DocumentException {
|
throws ISLookUpException, DocumentException {
|
||||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
|
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
|
||||||
|
|
|
@ -0,0 +1,160 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class EntityMergerTest implements Serializable {
|
||||||
|
|
||||||
|
List<Tuple2<String, Publication>> publications;
|
||||||
|
List<Tuple2<String, Publication>> publications2;
|
||||||
|
|
||||||
|
String testEntityBasePath;
|
||||||
|
DataInfo dataInfo;
|
||||||
|
String dedupId = "dedup_id";
|
||||||
|
Publication pub_top;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
|
||||||
|
testEntityBasePath = Paths
|
||||||
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
||||||
|
.toFile()
|
||||||
|
.getAbsolutePath();
|
||||||
|
|
||||||
|
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
||||||
|
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
|
||||||
|
|
||||||
|
pub_top = getTopPub(publications);
|
||||||
|
|
||||||
|
dataInfo = setDI();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
||||||
|
|
||||||
|
Publication pub_merged = DedupRecordFactory
|
||||||
|
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
|
assertEquals(dedupId, pub_merged.getId());
|
||||||
|
|
||||||
|
assertEquals(pub_merged.getJournal(), pub_top.getJournal());
|
||||||
|
assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());
|
||||||
|
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
|
||||||
|
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
|
||||||
|
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
|
||||||
|
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
|
||||||
|
assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
|
||||||
|
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
|
||||||
|
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
|
||||||
|
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
|
||||||
|
assertEquals(pub_merged.getInstance().size(), 3);
|
||||||
|
assertEquals(pub_merged.getCountry().size(), 2);
|
||||||
|
assertEquals(pub_merged.getSubject().size(), 0);
|
||||||
|
assertEquals(pub_merged.getTitle().size(), 2);
|
||||||
|
assertEquals(pub_merged.getRelevantdate().size(), 0);
|
||||||
|
assertEquals(pub_merged.getDescription().size(), 0);
|
||||||
|
assertEquals(pub_merged.getSource().size(), 0);
|
||||||
|
assertEquals(pub_merged.getFulltext().size(), 0);
|
||||||
|
assertEquals(pub_merged.getFormat().size(), 0);
|
||||||
|
assertEquals(pub_merged.getContributor().size(), 0);
|
||||||
|
assertEquals(pub_merged.getCoverage().size(), 0);
|
||||||
|
assertEquals(pub_merged.getContext().size(), 0);
|
||||||
|
assertEquals(pub_merged.getExternalReference().size(), 0);
|
||||||
|
assertEquals(pub_merged.getOriginalId().size(), 3);
|
||||||
|
assertEquals(pub_merged.getCollectedfrom().size(), 3);
|
||||||
|
assertEquals(pub_merged.getPid().size(), 1);
|
||||||
|
assertEquals(pub_merged.getExtraInfo().size(), 0);
|
||||||
|
|
||||||
|
// verify datainfo
|
||||||
|
assertEquals(pub_merged.getDataInfo(), dataInfo);
|
||||||
|
|
||||||
|
// verify datepicker
|
||||||
|
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
|
||||||
|
|
||||||
|
// verify authors
|
||||||
|
assertEquals(pub_merged.getAuthor().size(), 9);
|
||||||
|
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
||||||
|
|
||||||
|
//verify title
|
||||||
|
int count = 0;
|
||||||
|
for (StructuredProperty title: pub_merged.getTitle()){
|
||||||
|
if (title.getQualifier().getClassid().equals("main title"))
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
|
||||||
|
|
||||||
|
Publication pub_merged = DedupRecordFactory
|
||||||
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
|
assertEquals(pub_merged.getAuthor().size(), 27);
|
||||||
|
// insert assertions here
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public DataInfo setDI() {
|
||||||
|
DataInfo dataInfo = new DataInfo();
|
||||||
|
dataInfo.setTrust("0.9");
|
||||||
|
dataInfo.setDeletedbyinference(false);
|
||||||
|
dataInfo.setInferenceprovenance("testing");
|
||||||
|
dataInfo.setInferred(true);
|
||||||
|
return dataInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Publication getTopPub(List<Tuple2<String, Publication>> publications) {
|
||||||
|
|
||||||
|
Double maxTrust = 0.0;
|
||||||
|
Publication maxPub = new Publication();
|
||||||
|
for (Tuple2<String, Publication> publication : publications) {
|
||||||
|
Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
|
||||||
|
if (pubTrust > maxTrust) {
|
||||||
|
maxTrust = pubTrust;
|
||||||
|
maxPub = publication._2();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return maxPub;
|
||||||
|
}
|
||||||
|
|
||||||
|
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||||
|
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||||
|
BufferedReader reader;
|
||||||
|
try {
|
||||||
|
reader = new BufferedReader(new FileReader(path));
|
||||||
|
String line = reader.readLine();
|
||||||
|
while (line != null) {
|
||||||
|
res
|
||||||
|
.add(
|
||||||
|
new Tuple2<>(
|
||||||
|
MapDocumentUtil.getJPathString("$.id", line),
|
||||||
|
new ObjectMapper().readValue(line, clazz)));
|
||||||
|
// read next line
|
||||||
|
line = reader.readLine();
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,54 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
|
|
||||||
public class MergeAuthorTest {
|
|
||||||
|
|
||||||
private List<Publication> publicationsToMerge;
|
|
||||||
private final ObjectMapper mapper = new ObjectMapper();
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
public void setUp() throws Exception {
|
|
||||||
final String json = IOUtils
|
|
||||||
.toString(
|
|
||||||
this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json"));
|
|
||||||
|
|
||||||
publicationsToMerge = Arrays
|
|
||||||
.asList(json.split("\n"))
|
|
||||||
.stream()
|
|
||||||
.map(
|
|
||||||
s -> {
|
|
||||||
try {
|
|
||||||
return mapper.readValue(s, Publication.class);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
// FIX ME Michele DB this tests doesn't work
|
|
||||||
// @Test
|
|
||||||
public void test() throws Exception {
|
|
||||||
Publication dedup = new Publication();
|
|
||||||
|
|
||||||
publicationsToMerge
|
|
||||||
.forEach(
|
|
||||||
p -> {
|
|
||||||
dedup.mergeFrom(p);
|
|
||||||
dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor()));
|
|
||||||
});
|
|
||||||
|
|
||||||
System.out.println(mapper.writeValueAsString(dedup));
|
|
||||||
}
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class PropagationConstant {
|
public class PropagationConstant {
|
||||||
|
@ -24,10 +26,6 @@ public class PropagationConstant {
|
||||||
|
|
||||||
public static final String TRUE = "true";
|
public static final String TRUE = "true";
|
||||||
|
|
||||||
public static final String DNET_COUNTRY_SCHEMA = "dnet:countries";
|
|
||||||
public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
|
|
||||||
public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
|
|
||||||
|
|
||||||
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
|
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
|
||||||
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
|
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
|
||||||
|
|
||||||
|
@ -46,22 +44,6 @@ public class PropagationConstant {
|
||||||
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
|
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
|
||||||
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
|
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
|
||||||
|
|
||||||
public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy";
|
|
||||||
|
|
||||||
public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization";
|
|
||||||
public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation";
|
|
||||||
public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf";
|
|
||||||
public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution";
|
|
||||||
|
|
||||||
public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult";
|
|
||||||
|
|
||||||
public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject";
|
|
||||||
public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome";
|
|
||||||
public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy";
|
|
||||||
public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces";
|
|
||||||
|
|
||||||
public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges";
|
|
||||||
|
|
||||||
public static final String PROPAGATION_AUTHOR_PID = "ORCID";
|
public static final String PROPAGATION_AUTHOR_PID = "ORCID";
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
@ -76,8 +58,8 @@ public class PropagationConstant {
|
||||||
Country nc = new Country();
|
Country nc = new Country();
|
||||||
nc.setClassid(classid);
|
nc.setClassid(classid);
|
||||||
nc.setClassname(classname);
|
nc.setClassname(classname);
|
||||||
nc.setSchemename(DNET_COUNTRY_SCHEMA);
|
nc.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
|
||||||
nc.setSchemeid(DNET_COUNTRY_SCHEMA);
|
nc.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
|
||||||
nc
|
nc
|
||||||
.setDataInfo(
|
.setDataInfo(
|
||||||
getDataInfo(
|
getDataInfo(
|
||||||
|
@ -102,8 +84,8 @@ public class PropagationConstant {
|
||||||
Qualifier pa = new Qualifier();
|
Qualifier pa = new Qualifier();
|
||||||
pa.setClassid(inference_class_id);
|
pa.setClassid(inference_class_id);
|
||||||
pa.setClassname(inference_class_name);
|
pa.setClassname(inference_class_name);
|
||||||
pa.setSchemeid(DNET_SCHEMA_ID);
|
pa.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||||
pa.setSchemename(DNET_SCHEMA_NAME);
|
pa.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||||
return pa;
|
return pa;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.bulktag;
|
package eu.dnetlib.dhp.bulktag;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
@ -84,6 +85,7 @@ public class SparkBulkTagJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
|
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -131,7 +131,7 @@ public class CommunityConfiguration implements Serializable {
|
||||||
p -> {
|
p -> {
|
||||||
if (p.getSnd() == null)
|
if (p.getSnd() == null)
|
||||||
return p.getFst();
|
return p.getFst();
|
||||||
if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
|
if (p.getSnd().verifyCriteria(param))
|
||||||
return p.getFst();
|
return p.getFst();
|
||||||
else
|
else
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class VerbResolver implements Serializable {
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.toMap(
|
.toMap(
|
||||||
value -> (String) ((ClassInfo) value)
|
value -> (String) value
|
||||||
.getAnnotationInfo()
|
.getAnnotationInfo()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getParameterValues()
|
.getParameterValues()
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -76,9 +77,15 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
List<String> allowedtypes,
|
List<String> allowedtypes,
|
||||||
String inputPath,
|
String inputPath,
|
||||||
String outputPath) {
|
String outputPath) {
|
||||||
String whitelisted = "";
|
String whitelisted = " d.id = '" + whitelist.get(0) + "'";
|
||||||
for (String i : whitelist) {
|
for (int i = 1; i < whitelist.size(); i++) {
|
||||||
whitelisted += " OR id = '" + i + "'";
|
whitelisted += " OR d.id = '" + whitelist.get(i) + "'";
|
||||||
|
}
|
||||||
|
|
||||||
|
String allowed = "d.datasourcetype.classid = '" + allowedtypes.get(0) + "'";
|
||||||
|
|
||||||
|
for (int i = 1; i < allowedtypes.size(); i++) {
|
||||||
|
allowed += " OR d.datasourcetype.classid = '" + allowedtypes.get(i) + "'";
|
||||||
}
|
}
|
||||||
|
|
||||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||||
|
@ -89,26 +96,39 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
relation.createOrReplaceTempView("relation");
|
relation.createOrReplaceTempView("relation");
|
||||||
organization.createOrReplaceTempView("organization");
|
organization.createOrReplaceTempView("organization");
|
||||||
|
|
||||||
String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
// String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
||||||
+ "FROM ( SELECT id "
|
// + "FROM ( SELECT id "
|
||||||
+ " FROM datasource "
|
// + " FROM datasource "
|
||||||
+ " WHERE (datainfo.deletedbyinference = false "
|
// + " WHERE (datainfo.deletedbyinference = false "
|
||||||
+ whitelisted
|
// + whitelisted
|
||||||
+ ") "
|
// + ") "
|
||||||
+ getConstraintList("datasourcetype.classid = '", allowedtypes)
|
// + getConstraintList("datasourcetype.classid = '", allowedtypes)
|
||||||
+ ") d "
|
// + ") d "
|
||||||
+ "JOIN ( SELECT source, target "
|
// + "JOIN ( SELECT source, target "
|
||||||
+ " FROM relation "
|
// + " FROM relation "
|
||||||
+ " WHERE relclass = '"
|
// + " WHERE relclass = '"
|
||||||
+ RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
|
// + ModelConstants.IS_PROVIDED_BY
|
||||||
+ "' "
|
// + "' "
|
||||||
+ " AND datainfo.deletedbyinference = false ) rel "
|
// + " AND datainfo.deletedbyinference = false ) rel "
|
||||||
+ "ON d.id = rel.source "
|
// + "ON d.id = rel.source "
|
||||||
+ "JOIN (SELECT id, country "
|
// + "JOIN (SELECT id, country "
|
||||||
+ " FROM organization "
|
// + " FROM organization "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
// + " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND length(country.classid) > 0) o "
|
// + " AND length(country.classid) > 0) o "
|
||||||
+ "ON o.id = rel.target";
|
// + "ON o.id = rel.target";
|
||||||
|
|
||||||
|
String query = "SELECT source dataSourceId, " +
|
||||||
|
"named_struct('classid', country.classid, 'classname', country.classname) country " +
|
||||||
|
"FROM datasource d " +
|
||||||
|
"JOIN relation rel " +
|
||||||
|
"ON d.id = rel.source " +
|
||||||
|
"JOIN organization o " +
|
||||||
|
"ON o.id = rel.target " +
|
||||||
|
"WHERE rel.datainfo.deletedbyinference = false " +
|
||||||
|
"and rel.relclass = '" + ModelConstants.IS_PROVIDED_BY + "'" +
|
||||||
|
"and o.datainfo.deletedbyinference = false " +
|
||||||
|
"and length(o.country.classid) > 0 " +
|
||||||
|
"and (" + allowed + " or " + whitelisted + ")";
|
||||||
|
|
||||||
spark
|
spark
|
||||||
.sql(query)
|
.sql(query)
|
||||||
|
|
|
@ -4,7 +4,12 @@ package eu.dnetlib.dhp.countrypropagation;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -13,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultCountrySet {
|
public class PrepareResultCountrySet {
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
||||||
|
@ -60,6 +66,7 @@ public class PrepareResultCountrySet {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
getPotentialResultToUpdate(
|
getPotentialResultToUpdate(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -89,10 +96,33 @@ public class PrepareResultCountrySet {
|
||||||
spark
|
spark
|
||||||
.sql(RESULT_COUNTRYSET_QUERY)
|
.sql(RESULT_COUNTRYSET_QUERY)
|
||||||
.as(Encoders.bean(ResultCountrySet.class))
|
.as(Encoders.bean(ResultCountrySet.class))
|
||||||
.write()
|
.toJavaRDD()
|
||||||
.option("compression", "gzip")
|
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||||
.mode(SaveMode.Append)
|
.reduceByKey((a, b) -> {
|
||||||
.json(outputPath);
|
ArrayList<CountrySbs> countryList = a.getCountrySet();
|
||||||
|
Set<String> countryCodes = countryList
|
||||||
|
.stream()
|
||||||
|
.map(country -> country.getClassid())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
b
|
||||||
|
.getCountrySet()
|
||||||
|
.stream()
|
||||||
|
.forEach(c -> {
|
||||||
|
if (!countryCodes.contains(c.getClassid())) {
|
||||||
|
countryList.add(c);
|
||||||
|
countryCodes.add(c.getClassid());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
a.setCountrySet(countryList);
|
||||||
|
return a;
|
||||||
|
})
|
||||||
|
.map(couple -> OBJECT_MAPPER.writeValueAsString(couple._2()))
|
||||||
|
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
// .write()
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .mode(SaveMode.Append)
|
||||||
|
// .json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,13 +69,16 @@ public class SparkCountryPropagationJob {
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> execPropagation(
|
spark -> {
|
||||||
spark,
|
removeOutputDir(spark, outputPath);
|
||||||
sourcePath,
|
execPropagation(
|
||||||
preparedInfoPath,
|
spark,
|
||||||
outputPath,
|
sourcePath,
|
||||||
resultClazz,
|
preparedInfoPath,
|
||||||
saveGraph));
|
outputPath,
|
||||||
|
resultClazz,
|
||||||
|
saveGraph);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> void execPropagation(
|
private static <R extends Result> void execPropagation(
|
||||||
|
|
|
@ -74,9 +74,7 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
prepareInfo(
|
prepareInfo(
|
||||||
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
|
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
|
||||||
});
|
});
|
||||||
|
@ -97,22 +95,22 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
|
Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
|
||||||
result.createOrReplaceTempView("result");
|
result.createOrReplaceTempView("result");
|
||||||
|
|
||||||
String query = " select target resultId, author authorList"
|
String query = "SELECT target resultId, author authorList"
|
||||||
+ " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
|
+ " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
|
||||||
+ " from ( "
|
+ " FROM ( "
|
||||||
+ " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
|
+ " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
|
||||||
+ " from result "
|
+ " FROM result "
|
||||||
+ " lateral view explode (author) a as MyT "
|
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
|
||||||
+ " lateral view explode (MyT.pid) p as MyP "
|
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
|
||||||
+ " where MyP.qualifier.classid = 'ORCID') tmp "
|
+ " WHERE MyP.qualifier.classid = 'ORCID') tmp "
|
||||||
+ " group by id) r_t "
|
+ " GROUP BY id) r_t "
|
||||||
+ " join ("
|
+ " JOIN ("
|
||||||
+ " select source, target "
|
+ " SELECT source, target "
|
||||||
+ " from relation "
|
+ " FROM relation "
|
||||||
+ " where datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ getConstraintList(" relclass = '", allowedsemrel)
|
+ getConstraintList(" relclass = '", allowedsemrel)
|
||||||
+ ") rel_rel "
|
+ " ) rel_rel "
|
||||||
+ " on source = id";
|
+ " ON source = id";
|
||||||
spark
|
spark
|
||||||
.sql(query)
|
.sql(query)
|
||||||
.as(Encoders.bean(ResultOrcidList.class))
|
.as(Encoders.bean(ResultOrcidList.class))
|
||||||
|
|
|
@ -50,9 +50,7 @@ public class PrepareResultOrcidAssociationStep2 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
mergeInfo(spark, inputPath, outputPath);
|
mergeInfo(spark, inputPath, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
@ -70,11 +71,10 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
if (saveGraph) {
|
||||||
}
|
|
||||||
if (saveGraph)
|
|
||||||
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
|
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,40 +122,51 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
|
private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
|
||||||
|
PacePerson pp = new PacePerson(a.getFullname(), false);
|
||||||
for (AutoritativeAuthor aa : au) {
|
for (AutoritativeAuthor aa : au) {
|
||||||
if (enrichAuthor(aa, a)) {
|
if (enrichAuthor(aa, a, pp.getNormalisedFirstName(), pp.getNormalisedSurname())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
|
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author,
|
||||||
|
String author_name,
|
||||||
|
String author_surname) {
|
||||||
boolean toaddpid = false;
|
boolean toaddpid = false;
|
||||||
|
|
||||||
if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getSurname())) {
|
||||||
if (StringUtils.isNoneEmpty(author.getSurname())) {
|
if (StringUtils.isNotEmpty(author.getSurname())) {
|
||||||
|
author_surname = author.getSurname();
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotEmpty(author_surname)) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getSurname()
|
.getSurname()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getSurname().trim())) {
|
.equalsIgnoreCase(author_surname.trim())) {
|
||||||
|
|
||||||
// have the same surname. Check the name
|
// have the same surname. Check the name
|
||||||
if (StringUtils.isNoneEmpty(autoritative_author.getName())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getName())) {
|
||||||
if (StringUtils.isNoneEmpty(author.getName())) {
|
if (StringUtils.isNotEmpty(author.getName())) {
|
||||||
|
author_name = author.getName();
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotEmpty(author_name)) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getName()
|
.getName()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getName().trim())) {
|
.equalsIgnoreCase(author_name.trim())) {
|
||||||
toaddpid = true;
|
toaddpid = true;
|
||||||
}
|
}
|
||||||
// they could be differently written (i.e. only the initials of the name
|
// they could be differently written (i.e. only the initials of the name
|
||||||
// in one of the two
|
// in one of the two
|
||||||
if (autoritative_author
|
else {
|
||||||
.getName()
|
if (autoritative_author
|
||||||
.trim()
|
.getName()
|
||||||
.substring(0, 0)
|
.trim()
|
||||||
.equalsIgnoreCase(author.getName().trim().substring(0, 0))) {
|
.substring(0, 0)
|
||||||
toaddpid = true;
|
.equalsIgnoreCase(author_name.trim().substring(0, 0))) {
|
||||||
|
toaddpid = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class PrepareProjectResultsAssociation {
|
public class PrepareProjectResultsAssociation {
|
||||||
|
@ -60,6 +61,8 @@ public class PrepareProjectResultsAssociation {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, potentialUpdatePath);
|
||||||
|
removeOutputDir(spark, alreadyLinkedPath);
|
||||||
prepareResultProjProjectResults(
|
prepareResultProjProjectResults(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -83,7 +86,7 @@ public class PrepareProjectResultsAssociation {
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND relClass = '"
|
+ " AND relClass = '"
|
||||||
+ RELATION_RESULT_PROJECT_REL_CLASS
|
+ ModelConstants.IS_PRODUCED_BY
|
||||||
+ "'";
|
+ "'";
|
||||||
|
|
||||||
Dataset<Row> resproj_relation = spark.sql(resproj_relation_query);
|
Dataset<Row> resproj_relation = spark.sql(resproj_relation_query);
|
||||||
|
|
|
@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -104,11 +105,7 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(
|
.forEach(
|
||||||
(p -> {
|
(p -> {
|
||||||
if (potential_update
|
potential_update.getProjectSet().remove(p);
|
||||||
.getProjectSet()
|
|
||||||
.contains(p)) {
|
|
||||||
potential_update.getProjectSet().remove(p);
|
|
||||||
}
|
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
String resId = potential_update.getResultId();
|
String resId = potential_update.getResultId();
|
||||||
|
@ -122,9 +119,9 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
resId,
|
resId,
|
||||||
projectId,
|
projectId,
|
||||||
RELATION_RESULT_PROJECT_REL_CLASS,
|
ModelConstants.IS_PRODUCED_BY,
|
||||||
RELATION_RESULTPROJECT_REL_TYPE,
|
ModelConstants.RESULT_PROJECT,
|
||||||
RELATION_RESULTPROJECT_SUBREL_TYPE,
|
ModelConstants.OUTCOME,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
||||||
|
@ -133,9 +130,9 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
projectId,
|
projectId,
|
||||||
resId,
|
resId,
|
||||||
RELATION_PROJECT_RESULT_REL_CLASS,
|
ModelConstants.PRODUCES,
|
||||||
RELATION_RESULTPROJECT_REL_TYPE,
|
ModelConstants.RESULT_PROJECT,
|
||||||
RELATION_RESULTPROJECT_SUBREL_TYPE,
|
ModelConstants.OUTCOME,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
||||||
|
|
|
@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
|
@ -17,7 +18,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultCommunitySet {
|
public class PrepareResultCommunitySet {
|
||||||
|
|
||||||
|
@ -55,9 +58,7 @@ public class PrepareResultCommunitySet {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
prepareInfo(spark, inputPath, outputPath, organizationMap);
|
prepareInfo(spark, inputPath, outputPath, organizationMap);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -76,13 +77,13 @@ public class PrepareResultCommunitySet {
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND relClass = '"
|
+ " AND relClass = '"
|
||||||
+ RELATION_RESULT_ORGANIZATION_REL_CLASS
|
+ ModelConstants.HAS_AUTHOR_INSTITUTION
|
||||||
+ "') result_organization "
|
+ "') result_organization "
|
||||||
+ "LEFT JOIN (SELECT source, collect_set(target) org_set "
|
+ "LEFT JOIN (SELECT source, collect_set(target) org_set "
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND relClass = '"
|
+ " AND relClass = '"
|
||||||
+ RELATION_REPRESENTATIVERESULT_RESULT_CLASS
|
+ ModelConstants.MERGES
|
||||||
+ "' "
|
+ "' "
|
||||||
+ " GROUP BY source) organization_organization "
|
+ " GROUP BY source) organization_organization "
|
||||||
+ "ON result_organization.target = organization_organization.source ";
|
+ "ON result_organization.target = organization_organization.source ";
|
||||||
|
@ -94,10 +95,24 @@ public class PrepareResultCommunitySet {
|
||||||
result_organizationset
|
result_organizationset
|
||||||
.map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
|
.map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.write()
|
.toJavaRDD()
|
||||||
.mode(SaveMode.Overwrite)
|
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||||
.option("compression", "gzip")
|
.reduceByKey((a, b) -> {
|
||||||
.json(outputPath);
|
ArrayList<String> cl = a.getCommunityList();
|
||||||
|
b.getCommunityList().stream().forEach(s -> {
|
||||||
|
if (!cl.contains(s)) {
|
||||||
|
cl.add(s);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
a.setCommunityList(cl);
|
||||||
|
return a;
|
||||||
|
})
|
||||||
|
.map(value -> OBJECT_MAPPER.writeValueAsString(value._2()))
|
||||||
|
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
|
private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
|
||||||
|
|
|
@ -68,11 +68,10 @@ public class SparkResultToCommunityFromOrganizationJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
if (saveGraph) {
|
||||||
}
|
|
||||||
if (saveGraph)
|
|
||||||
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
|
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,9 +53,7 @@ public class PrepareResultCommunitySetStep2 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
mergeInfo(spark, inputPath, outputPath);
|
mergeInfo(spark, inputPath, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
@ -58,30 +59,15 @@ public class PrepareResultInstRepoAssociation {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
readNeededResources(spark, inputPath);
|
readNeededResources(spark, inputPath);
|
||||||
|
|
||||||
|
removeOutputDir(spark, datasourceOrganizationPath);
|
||||||
prepareDatasourceOrganization(spark, datasourceOrganizationPath);
|
prepareDatasourceOrganization(spark, datasourceOrganizationPath);
|
||||||
|
|
||||||
|
removeOutputDir(spark, alreadyLinkedPath);
|
||||||
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
|
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void prepareAlreadyLinkedAssociation(
|
|
||||||
SparkSession spark, String alreadyLinkedPath) {
|
|
||||||
String query = "Select source resultId, collect_set(target) organizationSet "
|
|
||||||
+ "from relation "
|
|
||||||
+ "where datainfo.deletedbyinference = false "
|
|
||||||
+ "and relClass = '"
|
|
||||||
+ RELATION_RESULT_ORGANIZATION_REL_CLASS
|
|
||||||
+ "' "
|
|
||||||
+ "group by source";
|
|
||||||
|
|
||||||
spark
|
|
||||||
.sql(query)
|
|
||||||
.as(Encoders.bean(ResultOrganizationSet.class))
|
|
||||||
// TODO retry to stick with datasets
|
|
||||||
.toJavaRDD()
|
|
||||||
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
|
||||||
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void readNeededResources(SparkSession spark, String inputPath) {
|
private static void readNeededResources(SparkSession spark, String inputPath) {
|
||||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||||
datasource.createOrReplaceTempView("datasource");
|
datasource.createOrReplaceTempView("datasource");
|
||||||
|
@ -106,7 +92,7 @@ public class PrepareResultInstRepoAssociation {
|
||||||
+ "JOIN ( SELECT source, target "
|
+ "JOIN ( SELECT source, target "
|
||||||
+ "FROM relation "
|
+ "FROM relation "
|
||||||
+ "WHERE relclass = '"
|
+ "WHERE relclass = '"
|
||||||
+ RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
|
+ ModelConstants.IS_PROVIDED_BY
|
||||||
+ "' "
|
+ "' "
|
||||||
+ "AND datainfo.deletedbyinference = false ) rel "
|
+ "AND datainfo.deletedbyinference = false ) rel "
|
||||||
+ "ON d.id = rel.source ";
|
+ "ON d.id = rel.source ";
|
||||||
|
@ -119,4 +105,24 @@ public class PrepareResultInstRepoAssociation {
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(datasourceOrganizationPath);
|
.json(datasourceOrganizationPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void prepareAlreadyLinkedAssociation(
|
||||||
|
SparkSession spark, String alreadyLinkedPath) {
|
||||||
|
String query = "Select source resultId, collect_set(target) organizationSet "
|
||||||
|
+ "from relation "
|
||||||
|
+ "where datainfo.deletedbyinference = false "
|
||||||
|
+ "and relClass = '"
|
||||||
|
+ ModelConstants.HAS_AUTHOR_INSTITUTION
|
||||||
|
+ "' "
|
||||||
|
+ "group by source";
|
||||||
|
|
||||||
|
spark
|
||||||
|
.sql(query)
|
||||||
|
.as(Encoders.bean(ResultOrganizationSet.class))
|
||||||
|
// TODO retry to stick with datasets
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
||||||
|
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -83,10 +84,8 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
// removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
if (saveGraph) {
|
||||||
}
|
|
||||||
if (saveGraph)
|
|
||||||
execPropagation(
|
execPropagation(
|
||||||
spark,
|
spark,
|
||||||
datasourceorganization,
|
datasourceorganization,
|
||||||
|
@ -94,6 +93,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
inputPath,
|
inputPath,
|
||||||
outputPath,
|
outputPath,
|
||||||
resultClazz);
|
resultClazz);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,9 +136,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(
|
.forEach(
|
||||||
rId -> {
|
rId -> {
|
||||||
if (organization_list.contains(rId)) {
|
organization_list.remove(rId);
|
||||||
organization_list.remove(rId);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
String resultId = potential_update.getResultId();
|
String resultId = potential_update.getResultId();
|
||||||
|
@ -151,9 +149,9 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
orgId,
|
orgId,
|
||||||
resultId,
|
resultId,
|
||||||
RELATION_ORGANIZATION_RESULT_REL_CLASS,
|
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||||
RELATION_RESULTORGANIZATION_REL_TYPE,
|
ModelConstants.RESULT_ORGANIZATION,
|
||||||
RELATION_RESULTORGANIZATION_SUBREL_TYPE,
|
ModelConstants.AFFILIATION,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
||||||
|
@ -162,9 +160,9 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
resultId,
|
resultId,
|
||||||
orgId,
|
orgId,
|
||||||
RELATION_RESULT_ORGANIZATION_REL_CLASS,
|
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||||
RELATION_RESULTORGANIZATION_REL_TYPE,
|
ModelConstants.RESULT_ORGANIZATION,
|
||||||
RELATION_RESULTORGANIZATION_SUBREL_TYPE,
|
ModelConstants.AFFILIATION,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
||||||
|
|
|
@ -18,6 +18,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -42,8 +53,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -53,8 +62,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -64,8 +71,6 @@
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -75,8 +80,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -95,13 +98,11 @@
|
||||||
|
|
||||||
<action name="join_bulktag_publication">
|
<action name="join_bulktag_publication">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-publication</name>
|
<name>bulkTagging-publication</name>
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
||||||
<jar>dhp-bulktag-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--num-executors=${sparkExecutorNumber}
|
--num-executors=${sparkExecutorNumber}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -124,13 +125,11 @@
|
||||||
|
|
||||||
<action name="join_bulktag_dataset">
|
<action name="join_bulktag_dataset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-dataset</name>
|
<name>bulkTagging-dataset</name>
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
||||||
<jar>dhp-bulktag-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--num-executors=${sparkExecutorNumber}
|
--num-executors=${sparkExecutorNumber}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -153,13 +152,11 @@
|
||||||
|
|
||||||
<action name="join_bulktag_otherresearchproduct">
|
<action name="join_bulktag_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-orp</name>
|
<name>bulkTagging-orp</name>
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
||||||
<jar>dhp-bulktag-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--num-executors=${sparkExecutorNumber}
|
--num-executors=${sparkExecutorNumber}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -182,13 +179,11 @@
|
||||||
|
|
||||||
<action name="join_bulktag_software">
|
<action name="join_bulktag_software">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-software</name>
|
<name>bulkTagging-software</name>
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
||||||
<jar>dhp-bulktag-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--num-executors=${sparkExecutorNumber}
|
--num-executors=${sparkExecutorNumber}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
|
|
@ -19,6 +19,17 @@
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -43,8 +54,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -54,18 +63,15 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
<ok to="copy_wait"/>
|
<ok to="copy_wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -75,8 +81,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -92,7 +96,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>PrepareDatasourceCountryAssociation</name>
|
<name>PrepareDatasourceCountryAssociation</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation</class>
|
<class>eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -126,7 +130,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>prepareResultCountry-Publication</name>
|
<name>prepareResultCountry-Publication</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -156,7 +160,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>prepareResultCountry-Dataset</name>
|
<name>prepareResultCountry-Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -186,7 +190,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>prepareResultCountry-ORP</name>
|
<name>prepareResultCountry-ORP</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -216,7 +220,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>prepareResultCountry-Software</name>
|
<name>prepareResultCountry-Software</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
<class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -255,7 +259,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>countryPropagationForPublications</name>
|
<name>countryPropagationForPublications</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -285,7 +289,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>countryPropagationForDataset</name>
|
<name>countryPropagationForDataset</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -315,7 +319,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>countryPropagationForORP</name>
|
<name>countryPropagationForORP</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -345,7 +349,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>countryPropagationForSoftware</name>
|
<name>countryPropagationForSoftware</name>
|
||||||
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
<class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
|
|
@ -57,6 +57,7 @@
|
||||||
<ok to="copy_wait"/>
|
<ok to="copy_wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
@ -81,7 +82,6 @@
|
||||||
|
|
||||||
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
|
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
|
||||||
|
|
||||||
|
|
||||||
<fork name="fork_prepare_assoc_step1">
|
<fork name="fork_prepare_assoc_step1">
|
||||||
<path start="join_prepare_publication"/>
|
<path start="join_prepare_publication"/>
|
||||||
<path start="join_prepare_dataset"/>
|
<path start="join_prepare_dataset"/>
|
||||||
|
@ -95,7 +95,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-PreparePhase1-Publications</name>
|
<name>ORCIDPropagation-PreparePhase1-Publications</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -111,16 +111,11 @@
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
--conf spark.hadoop.mapreduce.map.speculative=false
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>${sourcePath}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--resultTableName</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
|
||||||
<arg>--allowedsemrels</arg>
|
|
||||||
<arg>${allowedsemrels}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -132,7 +127,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-PreparePhase1-Dataset</name>
|
<name>ORCIDPropagation-PreparePhase1-Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -144,16 +139,11 @@
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>${sourcePath}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--resultTableName</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
|
||||||
<arg>--allowedsemrels</arg>
|
|
||||||
<arg>${allowedsemrels}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -165,7 +155,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-PreparePhase1-ORP</name>
|
<name>ORCIDPropagation-PreparePhase1-ORP</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -177,16 +167,11 @@
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>${sourcePath}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--resultTableName</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
|
||||||
<arg>--allowedsemrels</arg>
|
|
||||||
<arg>${allowedsemrels}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -198,7 +183,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-PreparePhase1-Software</name>
|
<name>ORCIDPropagation-PreparePhase1-Software</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -210,16 +195,11 @@
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>${sourcePath}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--resultTableName</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
|
||||||
<arg>--allowedsemrels</arg>
|
|
||||||
<arg>${allowedsemrels}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -233,7 +213,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-PreparePhase2</name>
|
<name>ORCIDPropagation-PreparePhase2</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -245,16 +225,13 @@
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--sourcePath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork-join-exec-propagation"/>
|
<ok to="fork-join-exec-propagation"/>
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name="fork-join-exec-propagation">
|
<fork name="fork-join-exec-propagation">
|
||||||
<path start="join_propagate_publication"/>
|
<path start="join_propagate_publication"/>
|
||||||
<path start="join_propagate_dataset"/>
|
<path start="join_propagate_dataset"/>
|
||||||
|
@ -268,7 +245,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-Publication</name>
|
<name>ORCIDPropagation-Publication</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -284,29 +261,24 @@
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>${sourcePath}/publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
<arg>--resultTableName</arg>
|
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${outputPath}/publication</arg>
|
|
||||||
<arg>--saveGraph</arg>
|
|
||||||
<arg>${saveGraph}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagate_dataset">
|
<action name="join_propagate_dataset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-Dataset</name>
|
<name>ORCIDPropagation-Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -321,29 +293,24 @@
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
--conf spark.hadoop.mapreduce.map.speculative=false
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>${sourcePath}/dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
<arg>--resultTableName</arg>
|
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${outputPath}/dataset</arg>
|
|
||||||
<arg>--saveGraph</arg>
|
|
||||||
<arg>${saveGraph}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagate_otherresearchproduct">
|
<action name="join_propagate_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-ORP</name>
|
<name>ORCIDPropagation-ORP</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -358,29 +325,24 @@
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
--conf spark.hadoop.mapreduce.map.speculative=false
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
<arg>--resultTableName</arg>
|
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${outputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--saveGraph</arg>
|
|
||||||
<arg>${saveGraph}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagate_software">
|
<action name="join_propagate_software">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ORCIDPropagation-Software</name>
|
<name>ORCIDPropagation-Software</name>
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -395,22 +357,19 @@
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
--conf spark.hadoop.mapreduce.map.speculative=false
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--sourcePath</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>${sourcePath}/software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--hive_metastore_uris</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||||
<arg>${hive_metastore_uris}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
<arg>--resultTableName</arg>
|
|
||||||
<arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--outputPath</arg>
|
|
||||||
<arg>${outputPath}/software</arg>
|
|
||||||
<arg>--saveGraph</arg>
|
|
||||||
<arg>${saveGraph}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait2" to="End"/>
|
<join name="wait2" to="End"/>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -14,6 +14,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -42,8 +53,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -53,8 +62,6 @@
|
||||||
|
|
||||||
<action name="copy_publication">
|
<action name="copy_publication">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -64,8 +71,6 @@
|
||||||
|
|
||||||
<action name="copy_dataset">
|
<action name="copy_dataset">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -75,8 +80,6 @@
|
||||||
|
|
||||||
<action name="copy_orp">
|
<action name="copy_orp">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -86,28 +89,24 @@
|
||||||
|
|
||||||
<action name="copy_software">
|
<action name="copy_software">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
<arg>${nameNode}/${outputPath}/software</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -117,8 +116,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -127,61 +124,60 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait" to="prepare_project_results_association"/>
|
<join name="wait" to="prepare_project_results_association"/>
|
||||||
|
|
||||||
|
<action name="prepare_project_results_association">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>PrepareProjectResultsAssociation</name>
|
||||||
|
<class>eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation</class>
|
||||||
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
|
||||||
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="apply_propagation"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="apply_propagation">
|
||||||
<action name="prepare_project_results_association">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<master>yarn</master>
|
||||||
<master>yarn</master>
|
<mode>cluster</mode>
|
||||||
<mode>cluster</mode>
|
<name>ProjectToResultPropagation</name>
|
||||||
<name>PrepareProjectResultsAssociation</name>
|
<class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
|
||||||
<class>eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation</class>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<spark-opts>
|
||||||
<spark-opts>
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
</spark-opts>
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
</spark-opts>
|
||||||
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
|
||||||
</spark>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
<ok to="apply_propagation"/>
|
</spark>
|
||||||
<error to="Kill"/>
|
<ok to="End"/>
|
||||||
</action>
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
<action name="apply_propagation">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>ProjectToResultPropagation</name>
|
|
||||||
<class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
|
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
|
||||||
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
|
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -38,8 +49,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -49,8 +58,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -60,8 +67,6 @@
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -71,8 +76,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -88,7 +91,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Prepare-Community-Result-Organization</name>
|
<name>Prepare-Community-Result-Organization</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.PrepareResultCommunitySet</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.PrepareResultCommunitySet</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -101,8 +104,8 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
|
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork-join-exec-propagation"/>
|
<ok to="fork-join-exec-propagation"/>
|
||||||
|
@ -122,7 +125,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>community2resultfromorganization-Publication</name>
|
<name>community2resultfromorganization-Publication</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -136,9 +139,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
@ -151,7 +154,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>community2resultfromorganization-Dataset</name>
|
<name>community2resultfromorganization-Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -165,9 +168,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
@ -180,7 +183,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>community2resultfromorganization-ORP</name>
|
<name>community2resultfromorganization-ORP</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -194,9 +197,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
@ -209,7 +212,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>community2resultfromorganization-Software</name>
|
<name>community2resultfromorganization-Software</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -223,9 +226,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
|
|
@ -99,7 +99,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ResultToCommunitySemRel-PreparePhase1-Publications</name>
|
<name>ResultToCommunitySemRel-PreparePhase1-Publications</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -128,7 +128,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ResultToCommunitySemRel-PreparePhase1-Dataset</name>
|
<name>ResultToCommunitySemRel-PreparePhase1-Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -150,13 +150,14 @@
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_prepare_otherresearchproduct">
|
<action name="join_prepare_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ResultToCommunitySemRel-PreparePhase1-ORP</name>
|
<name>ResultToCommunitySemRel-PreparePhase1-ORP</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -185,7 +186,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ResultToCommunitySemRel-PreparePhase1-Software</name>
|
<name>ResultToCommunitySemRel-PreparePhase1-Software</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -216,7 +217,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ResultToCommunityEmRelPropagation-PreparePhase2</name>
|
<name>ResultToCommunityEmRelPropagation-PreparePhase2</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -232,9 +233,7 @@
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork-join-exec-propagation"/>
|
<ok to="fork-join-exec-propagation"/>
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name="fork-join-exec-propagation">
|
<fork name="fork-join-exec-propagation">
|
||||||
|
@ -250,7 +249,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Result2CommunitySemRelPropagation-Publication</name>
|
<name>Result2CommunitySemRelPropagation-Publication</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -279,7 +278,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Result2CommunitySemRelPropagation-Dataset</name>
|
<name>Result2CommunitySemRelPropagation-Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -308,7 +307,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Result2CommunitySemRelPropagation-ORP</name>
|
<name>Result2CommunitySemRelPropagation-ORP</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -337,7 +336,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Result2CommunitySemRelPropagation-Software</name>
|
<name>Result2CommunitySemRelPropagation-Software</name>
|
||||||
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
<class>eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
|
|
@ -10,6 +10,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -38,8 +49,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -49,8 +58,6 @@
|
||||||
|
|
||||||
<action name="copy_publication">
|
<action name="copy_publication">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -60,8 +67,6 @@
|
||||||
|
|
||||||
<action name="copy_dataset">
|
<action name="copy_dataset">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -71,8 +76,6 @@
|
||||||
|
|
||||||
<action name="copy_orp">
|
<action name="copy_orp">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -82,8 +85,6 @@
|
||||||
|
|
||||||
<action name="copy_software">
|
<action name="copy_software">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
<arg>${nameNode}/${outputPath}/software</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -93,8 +94,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -104,8 +103,6 @@
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -115,8 +112,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -125,13 +120,14 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait" to="prepare_result_organization_association"/>
|
<join name="wait" to="prepare_result_organization_association"/>
|
||||||
|
|
||||||
<action name="prepare_result_organization_association">
|
<action name="prepare_result_organization_association">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>PrepareResultOrganizationAssociation</name>
|
<name>PrepareResultOrganizationAssociation</name>
|
||||||
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.PrepareResultInstRepoAssociation</class>
|
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.PrepareResultInstRepoAssociation</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -163,7 +159,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>resultToOrganizationFromInstRepoPropagationForPublications</name>
|
<name>resultToOrganizationFromInstRepoPropagationForPublications</name>
|
||||||
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -176,12 +172,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -193,7 +189,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>resultToOrganizationFromInstRepoPropagationForDataset</name>
|
<name>resultToOrganizationFromInstRepoPropagationForDataset</name>
|
||||||
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -206,12 +202,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -223,7 +219,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>resultToOrganizationFromInstRepoPropagationForORP</name>
|
<name>resultToOrganizationFromInstRepoPropagationForORP</name>
|
||||||
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -236,12 +232,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -253,7 +249,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>resultToOrganizationFromInstRepoPropagationForSoftware</name>
|
<name>resultToOrganizationFromInstRepoPropagationForSoftware</name>
|
||||||
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class>
|
||||||
<jar>dhp-propagation-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -266,12 +262,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue