forked from D-Net/dnet-hadoop
WIP merged from graph_cleaning_refactoring, applying model simplification
This commit is contained in:
commit
8c9a77d7eb
|
@ -86,121 +86,10 @@ public class ModelConstants {
|
||||||
public static final Qualifier PROVENANCE_ACTION_SET_QUALIFIER = qualifier(
|
public static final Qualifier PROVENANCE_ACTION_SET_QUALIFIER = qualifier(
|
||||||
SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS);
|
SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS);
|
||||||
|
|
||||||
public static final String DATASET_RESULTTYPE_CLASSID = "dataset";
|
|
||||||
public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication";
|
|
||||||
public static final String SOFTWARE_RESULTTYPE_CLASSID = "software";
|
|
||||||
public static final String ORP_RESULTTYPE_CLASSID = "other";
|
|
||||||
|
|
||||||
public static final String RESULT_RESULT = "resultResult"; // relType
|
|
||||||
/**
|
|
||||||
* @deprecated Use {@link ModelConstants#RELATIONSHIP} instead.
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public static final String PUBLICATION_DATASET = "publicationDataset"; // subreltype
|
|
||||||
|
|
||||||
public static final String SUPPLEMENT = "supplement"; // subreltype
|
|
||||||
public static final String IS_SUPPLEMENT_TO = "IsSupplementTo";
|
|
||||||
public static final String IS_SUPPLEMENTED_BY = "IsSupplementedBy";
|
|
||||||
|
|
||||||
public static final String PART = "part"; // subreltype
|
|
||||||
public static final String IS_PART_OF = "IsPartOf";
|
|
||||||
public static final String HAS_PART = "HasPart";
|
|
||||||
|
|
||||||
public static final String RELATIONSHIP = "relationship"; // subreltype
|
|
||||||
|
|
||||||
public static final String IS_RELATED_TO = "IsRelatedTo";
|
|
||||||
public static final String IS_IDENTICAL_TO = "IsIdenticalTo";
|
|
||||||
|
|
||||||
public static final String REFERENCES = "References";
|
|
||||||
public static final String IS_REFERENCED_BY = "IsReferencedBy";
|
|
||||||
public static final String CONTINUES = "Continues";
|
|
||||||
public static final String IS_CONTINUED_BY = "IsContinuedBy";
|
|
||||||
public static final String DOCUMENTS = "Documents";
|
|
||||||
public static final String IS_DOCUMENTED_BY = "IsDocumentedBy";
|
|
||||||
public static final String IS_SOURCE_OF = "IsSourceOf";
|
|
||||||
public static final String IS_DERIVED_FROM = "IsDerivedFrom";
|
|
||||||
public static final String COMPILES = "Compiles";
|
|
||||||
public static final String IS_COMPILED_BY = "IsCompiledBy";
|
|
||||||
public static final String DESCRIBES = "Describes";
|
|
||||||
public static final String IS_DESCRIBED_BY = "IsDescribedBy";
|
|
||||||
public static final String IS_METADATA_FOR = "IsMetadataFor";
|
|
||||||
public static final String IS_METADATA_OF = "IsMetadataOf";
|
|
||||||
public static final String HAS_ASSOCIATION_WITH = "HasAssociationWith";
|
|
||||||
public static final String IS_REQUIRED_BY = "IsRequiredBy";
|
|
||||||
public static final String REQUIRES = "Requires";
|
|
||||||
|
|
||||||
public static final String CITATION = "citation"; // subreltype
|
|
||||||
public static final String CITES = "Cites";
|
|
||||||
public static final String IS_CITED_BY = "IsCitedBy";
|
|
||||||
|
|
||||||
public static final String REVIEW = "review"; // subreltype
|
|
||||||
public static final String REVIEWS = "Reviews";
|
|
||||||
public static final String IS_REVIEWED_BY = "IsReviewedBy";
|
|
||||||
|
|
||||||
public static final String VERSION = "version"; // subreltype
|
|
||||||
public static final String IS_VERSION_OF = "IsVersionOf";
|
|
||||||
public static final String HAS_VERSION = "HasVersion";
|
|
||||||
public static final String IS_PREVIOUS_VERSION_OF = "IsPreviousVersionOf";
|
|
||||||
public static final String IS_NEW_VERSION_OF = "IsNewVersionOf";
|
|
||||||
public static final String IS_VARIANT_FORM_OF = "IsVariantFormOf";
|
|
||||||
public static final String IS_ORIGINAL_FORM_OF = "IsOriginalFormOf";
|
|
||||||
public static final String IS_OBSOLETED_BY = "IsObsoletedBy";
|
|
||||||
public static final String OBSOLETES = "Obsoletes";
|
|
||||||
|
|
||||||
public static final String RESULT_PROJECT = "resultProject"; // relType
|
|
||||||
public static final String OUTCOME = "outcome"; // subreltype
|
|
||||||
public static final String IS_PRODUCED_BY = "isProducedBy";
|
|
||||||
public static final String PRODUCES = "produces";
|
|
||||||
|
|
||||||
public static final String DATASOURCE_ORGANIZATION = "datasourceOrganization"; // relType
|
|
||||||
public static final String PROVISION = "provision"; // subreltype
|
|
||||||
public static final String IS_PROVIDED_BY = "isProvidedBy";
|
|
||||||
public static final String PROVIDES = "provides";
|
|
||||||
|
|
||||||
public static final String PROJECT_ORGANIZATION = "projectOrganization"; // relType
|
|
||||||
public static final String PARTICIPATION = "participation"; // subreltype
|
|
||||||
public static final String HAS_PARTICIPANT = "hasParticipant";
|
|
||||||
public static final String IS_PARTICIPANT = "isParticipant";
|
|
||||||
|
|
||||||
public static final String RESULT_ORGANIZATION = "resultOrganization"; // relType
|
|
||||||
public static final String AFFILIATION = "affiliation"; // subreltype
|
|
||||||
public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf";
|
|
||||||
public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution";
|
|
||||||
|
|
||||||
public static final String ORG_ORG_RELTYPE = "organizationOrganization"; // relType
|
|
||||||
public static final String IS_PARENT_OF = "IsParentOf";
|
|
||||||
public static final String IS_CHILD_OF = "IsChildOf";
|
|
||||||
|
|
||||||
public static final String DEDUP = "dedup"; // subreltype
|
|
||||||
public static final String MERGES = "merges";
|
|
||||||
public static final String IS_MERGED_IN = "isMergedIn";
|
|
||||||
|
|
||||||
public static final String SIMILARITY = "similarity"; // subreltype
|
|
||||||
public static final String IS_SIMILAR_TO = "isSimilarTo";
|
|
||||||
public static final String IS_AMONG_TOP_N_SIMILAR_DOCS = "IsAmongTopNSimilarDocuments";
|
|
||||||
public static final String HAS_AMONG_TOP_N_SIMILAR_DOCS = "HasAmongTopNSimilarDocuments";
|
|
||||||
|
|
||||||
public static final String IS_DIFFERENT_FROM = "isDifferentFrom";
|
|
||||||
|
|
||||||
public static final String UNKNOWN = "UNKNOWN";
|
public static final String UNKNOWN = "UNKNOWN";
|
||||||
public static final String NOT_AVAILABLE = "not available";
|
public static final String NOT_AVAILABLE = "not available";
|
||||||
|
|
||||||
public static final Qualifier PUBLICATION_DEFAULT_RESULTTYPE = qualifier(
|
|
||||||
PUBLICATION_RESULTTYPE_CLASSID, PUBLICATION_RESULTTYPE_CLASSID,
|
|
||||||
DNET_RESULT_TYPOLOGIES);
|
|
||||||
|
|
||||||
public static final Qualifier DATASET_DEFAULT_RESULTTYPE = qualifier(
|
|
||||||
DATASET_RESULTTYPE_CLASSID, DATASET_RESULTTYPE_CLASSID,
|
|
||||||
DNET_RESULT_TYPOLOGIES);
|
|
||||||
|
|
||||||
public static final Qualifier SOFTWARE_DEFAULT_RESULTTYPE = qualifier(
|
|
||||||
SOFTWARE_RESULTTYPE_CLASSID, SOFTWARE_RESULTTYPE_CLASSID,
|
|
||||||
DNET_RESULT_TYPOLOGIES);
|
|
||||||
|
|
||||||
public static final Qualifier ORP_DEFAULT_RESULTTYPE = qualifier(
|
|
||||||
ORP_RESULTTYPE_CLASSID, ORP_RESULTTYPE_CLASSID,
|
|
||||||
DNET_RESULT_TYPOLOGIES);
|
|
||||||
|
|
||||||
public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = qualifier(
|
public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = qualifier(
|
||||||
SYSIMPORT_CROSSWALK_REPOSITORY, SYSIMPORT_CROSSWALK_REPOSITORY,
|
SYSIMPORT_CROSSWALK_REPOSITORY, SYSIMPORT_CROSSWALK_REPOSITORY,
|
||||||
DNET_PROVENANCE_ACTIONS);
|
DNET_PROVENANCE_ACTIONS);
|
||||||
|
|
|
@ -96,48 +96,6 @@ public class ModelSupport {
|
||||||
idPrefixEntity.put("50", "result");
|
idPrefixEntity.put("50", "result");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
|
|
||||||
|
|
||||||
static {
|
|
||||||
set(relationInverseMap, PROJECT_ORGANIZATION, PARTICIPATION, IS_PARTICIPANT, HAS_PARTICIPANT);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF, HAS_AUTHOR_INSTITUTION);
|
|
||||||
|
|
||||||
set(relationInverseMap, ORG_ORG_RELTYPE, DEDUP, IS_MERGED_IN, MERGES);
|
|
||||||
set(relationInverseMap, ORG_ORG_RELTYPE, DEDUP, IS_SIMILAR_TO, IS_SIMILAR_TO);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, PRODUCES);
|
|
||||||
|
|
||||||
set(relationInverseMap, DATASOURCE_ORGANIZATION, PROVISION, IS_PROVIDED_BY, PROVIDES);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_RESULT, SIMILARITY, IS_AMONG_TOP_N_SIMILAR_DOCS, HAS_AMONG_TOP_N_SIMILAR_DOCS);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, IS_SUPPLEMENTED_BY);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, PART, IS_PART_OF, HAS_PART);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, DEDUP, IS_MERGED_IN, MERGES);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, DEDUP, IS_SIMILAR_TO, IS_SIMILAR_TO);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, CITATION, IS_CITED_BY, CITES);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_IDENTICAL_TO, IS_IDENTICAL_TO);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_REFERENCED_BY, REFERENCES);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_CONTINUED_BY, CONTINUES);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DOCUMENTED_BY, DOCUMENTS);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DERIVED_FROM, IS_SOURCE_OF);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, IS_RELATED_TO);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_COMPILED_BY, COMPILES);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DESCRIBED_BY, DESCRIBES);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_METADATA_FOR, IS_METADATA_OF);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, HAS_ASSOCIATION_WITH, HAS_ASSOCIATION_WITH);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_REQUIRED_BY, REQUIRES);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_RESULT, VERSION, IS_PREVIOUS_VERSION_OF, IS_NEW_VERSION_OF);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, VERSION, IS_VARIANT_FORM_OF, IS_ORIGINAL_FORM_OF);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, VERSION, IS_OBSOLETED_BY, OBSOLETES);
|
|
||||||
set(relationInverseMap, RESULT_RESULT, VERSION, IS_VERSION_OF, HAS_VERSION);
|
|
||||||
|
|
||||||
set(relationInverseMap, RESULT_RESULT, REVIEW, IS_REVIEWED_BY, REVIEWS);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void set(Map<String, RelationInverse> relationInverseMap, String relType, String subRelType,
|
private static void set(Map<String, RelationInverse> relationInverseMap, String relType, String subRelType,
|
||||||
String relClass, String inverseRelClass) {
|
String relClass, String inverseRelClass) {
|
||||||
relationInverseMap
|
relationInverseMap
|
||||||
|
@ -158,35 +116,6 @@ public class ModelSupport {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper method: lookup relation inverse, given the direct relation encoding (case insensitive)
|
|
||||||
* @param encoding
|
|
||||||
* @return the relation inverse descriptor; throws {@link IllegalArgumentException} when not found.
|
|
||||||
*/
|
|
||||||
public static RelationInverse findInverse(String encoding) {
|
|
||||||
return ModelSupport.relationInverseMap
|
|
||||||
.entrySet()
|
|
||||||
.stream()
|
|
||||||
.filter(r -> encoding.equalsIgnoreCase(r.getKey()))
|
|
||||||
.findFirst()
|
|
||||||
.map(r -> r.getValue())
|
|
||||||
.orElseThrow(() -> new IllegalArgumentException("invalid relationship: " + encoding));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper method: find a relation filtering by a relation name
|
|
||||||
* @param relationName
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public static RelationInverse findRelation(final String relationName) {
|
|
||||||
return relationInverseMap
|
|
||||||
.values()
|
|
||||||
.stream()
|
|
||||||
.filter(r -> relationName.equalsIgnoreCase(r.getRelClass()))
|
|
||||||
.findFirst()
|
|
||||||
.orElse(null);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method: combines the relation attributes
|
* Helper method: combines the relation attributes
|
||||||
* @param relType
|
* @param relType
|
||||||
|
@ -364,17 +293,17 @@ public class ModelSupport {
|
||||||
.join(
|
.join(
|
||||||
source,
|
source,
|
||||||
target,
|
target,
|
||||||
relType,
|
relType.toString(),
|
||||||
subRelType,
|
subRelType.toString(),
|
||||||
relClass))
|
relClass.toString()))
|
||||||
.orElse(
|
.orElse(
|
||||||
String
|
String
|
||||||
.join(
|
.join(
|
||||||
source,
|
source,
|
||||||
target,
|
target,
|
||||||
relType,
|
relType.toString(),
|
||||||
subRelType)))
|
subRelType.toString())))
|
||||||
.orElse(String.join(source, target, relType)))
|
.orElse(String.join(source, target, relType.toString())))
|
||||||
.orElse(String.join(source, target)))
|
.orElse(String.join(source, target)))
|
||||||
.orElse(source))
|
.orElse(source))
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
|
|
|
@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
@ -38,6 +40,127 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
|
||||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||||
|
|
||||||
|
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
|
||||||
|
if (ModelSupport.isSubClass(value, Result.class)) {
|
||||||
|
final Result res = (Result) value;
|
||||||
|
if (shouldCleanContext(res, verifyParam)) {
|
||||||
|
res
|
||||||
|
.setContext(
|
||||||
|
res
|
||||||
|
.getContext()
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
return (T) res;
|
||||||
|
} else {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean shouldCleanContext(Result res, String verifyParam) {
|
||||||
|
boolean titleMatch = res
|
||||||
|
.getTitle()
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
t -> t
|
||||||
|
.getQualifier()
|
||||||
|
.getClassid()
|
||||||
|
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
|
||||||
|
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()));
|
||||||
|
|
||||||
|
return titleMatch && Objects.nonNull(res.getContext());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> T cleanCountry(T value, String[] verifyParam, Set<String> hostedBy,
|
||||||
|
String collectedfrom, String country) {
|
||||||
|
if (ModelSupport.isSubClass(value, Result.class)) {
|
||||||
|
final Result res = (Result) value;
|
||||||
|
if (res.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
|
||||||
|
!res.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
|
||||||
|
return (T) res;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<StructuredProperty> ids = getPidsAndAltIds(res).collect(Collectors.toList());
|
||||||
|
if (ids
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
p -> p
|
||||||
|
.getQualifier()
|
||||||
|
.getClassid()
|
||||||
|
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
|
||||||
|
res
|
||||||
|
.setCountry(
|
||||||
|
res
|
||||||
|
.getCountry()
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
c -> toTakeCountry(c, country))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return (T) res;
|
||||||
|
} else {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
|
||||||
|
final Stream<StructuredProperty> resultPids = Optional
|
||||||
|
.ofNullable(r.getPid())
|
||||||
|
.map(Collection::stream)
|
||||||
|
.orElse(Stream.empty());
|
||||||
|
|
||||||
|
final Stream<StructuredProperty> instancePids = Optional
|
||||||
|
.ofNullable(r.getInstance())
|
||||||
|
.map(
|
||||||
|
instance -> instance
|
||||||
|
.stream()
|
||||||
|
.flatMap(
|
||||||
|
i -> Optional
|
||||||
|
.ofNullable(i.getPid())
|
||||||
|
.map(Collection::stream)
|
||||||
|
.orElse(Stream.empty())))
|
||||||
|
.orElse(Stream.empty());
|
||||||
|
|
||||||
|
final Stream<StructuredProperty> instanceAltIds = Optional
|
||||||
|
.ofNullable(r.getInstance())
|
||||||
|
.map(
|
||||||
|
instance -> instance
|
||||||
|
.stream()
|
||||||
|
.flatMap(
|
||||||
|
i -> Optional
|
||||||
|
.ofNullable(i.getAlternateIdentifier())
|
||||||
|
.map(Collection::stream)
|
||||||
|
.orElse(Stream.empty())))
|
||||||
|
.orElse(Stream.empty());
|
||||||
|
|
||||||
|
return Stream
|
||||||
|
.concat(
|
||||||
|
Stream.concat(resultPids, instancePids),
|
||||||
|
instanceAltIds);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean pidInParam(String value, String[] verifyParam) {
|
||||||
|
for (String s : verifyParam)
|
||||||
|
if (value.startsWith(s))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean toTakeCountry(Country c, String country) {
|
||||||
|
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
|
||||||
|
// inserted via propagation
|
||||||
|
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
|
||||||
|
return true;
|
||||||
|
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
|
||||||
|
return true;
|
||||||
|
return !(c
|
||||||
|
.getClassid()
|
||||||
|
.equalsIgnoreCase(country) &&
|
||||||
|
c.getDataInfo().getInferenceprovenance().equals("propagation"));
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
// nothing to clean here
|
// nothing to clean here
|
||||||
|
|
|
@ -18,443 +18,443 @@ import eu.dnetlib.dhp.schema.oaf.common.AccessRightComparator;
|
||||||
|
|
||||||
public class OafMapperUtils {
|
public class OafMapperUtils {
|
||||||
|
|
||||||
private OafMapperUtils() {
|
private OafMapperUtils() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static KeyValue keyValue(final String k, final String v) {
|
public static KeyValue keyValue(final String k, final String v) {
|
||||||
final KeyValue kv = new KeyValue();
|
final KeyValue kv = new KeyValue();
|
||||||
kv.setKey(k);
|
kv.setKey(k);
|
||||||
kv.setValue(v);
|
kv.setValue(v);
|
||||||
return kv;
|
return kv;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<KeyValue> listKeyValues(final String... s) {
|
public static List<KeyValue> listKeyValues(final String... s) {
|
||||||
if (s.length % 2 > 0) {
|
if (s.length % 2 > 0) {
|
||||||
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
|
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<KeyValue> list = new ArrayList<>();
|
final List<KeyValue> list = new ArrayList<>();
|
||||||
for (int i = 0; i < s.length; i += 2) {
|
for (int i = 0; i < s.length; i += 2) {
|
||||||
list.add(keyValue(s[i], s[i + 1]));
|
list.add(keyValue(s[i], s[i + 1]));
|
||||||
}
|
}
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T> List<T> listValues(Array values) throws SQLException {
|
public static <T> List<T> listValues(Array values) throws SQLException {
|
||||||
if (Objects.isNull(values)) {
|
if (Objects.isNull(values)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return Arrays
|
return Arrays
|
||||||
.stream((T[]) values.getArray())
|
.stream((T[]) values.getArray())
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.distinct()
|
.distinct()
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Qualifier unknown(final String schemeid) {
|
public static Qualifier unknown(final String schemeid) {
|
||||||
return qualifier(UNKNOWN, "Unknown", schemeid);
|
return qualifier(UNKNOWN, "Unknown", schemeid);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AccessRight accessRight(
|
public static AccessRight accessRight(
|
||||||
final String classid,
|
final String classid,
|
||||||
final String classname,
|
final String classname,
|
||||||
final String schemeid) {
|
final String schemeid) {
|
||||||
return accessRight(classid, classname, schemeid, null);
|
return accessRight(classid, classname, schemeid, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AccessRight accessRight(
|
public static AccessRight accessRight(
|
||||||
final String classid,
|
final String classid,
|
||||||
final String classname,
|
final String classname,
|
||||||
final String schemeid,
|
final String schemeid,
|
||||||
final OpenAccessRoute openAccessRoute) {
|
final OpenAccessRoute openAccessRoute) {
|
||||||
final AccessRight accessRight = new AccessRight();
|
final AccessRight accessRight = new AccessRight();
|
||||||
accessRight.setClassid(classid);
|
accessRight.setClassid(classid);
|
||||||
accessRight.setClassname(classname);
|
accessRight.setClassname(classname);
|
||||||
accessRight.setSchemeid(schemeid);
|
accessRight.setSchemeid(schemeid);
|
||||||
accessRight.setOpenAccessRoute(openAccessRoute);
|
accessRight.setOpenAccessRoute(openAccessRoute);
|
||||||
return accessRight;
|
return accessRight;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Qualifier qualifier(
|
public static Qualifier qualifier(
|
||||||
final String classid,
|
final String classid,
|
||||||
final String classname,
|
final String classname,
|
||||||
final String schemeid) {
|
final String schemeid) {
|
||||||
final Qualifier q = new Qualifier();
|
final Qualifier q = new Qualifier();
|
||||||
q.setClassid(classid);
|
q.setClassid(classid);
|
||||||
q.setClassname(classname);
|
q.setClassname(classname);
|
||||||
q.setSchemeid(schemeid);
|
q.setSchemeid(schemeid);
|
||||||
return q;
|
return q;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Qualifier qualifier(final Qualifier qualifier) {
|
public static Qualifier qualifier(final Qualifier qualifier) {
|
||||||
final Qualifier q = new Qualifier();
|
final Qualifier q = new Qualifier();
|
||||||
q.setClassid(qualifier.getClassid());
|
q.setClassid(qualifier.getClassid());
|
||||||
q.setClassname(qualifier.getClassname());
|
q.setClassname(qualifier.getClassname());
|
||||||
q.setSchemeid(qualifier.getSchemeid());
|
q.setSchemeid(qualifier.getSchemeid());
|
||||||
return q;
|
return q;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Subject subject(
|
public static Subject subject(
|
||||||
final String value,
|
final String value,
|
||||||
final String classid,
|
final String classid,
|
||||||
final String classname,
|
final String classname,
|
||||||
final String schemeid,
|
final String schemeid,
|
||||||
final DataInfo dataInfo) {
|
final DataInfo dataInfo) {
|
||||||
|
|
||||||
return subject(value, qualifier(classid, classname, schemeid), dataInfo);
|
return subject(value, qualifier(classid, classname, schemeid), dataInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static StructuredProperty structuredProperty(
|
public static StructuredProperty structuredProperty(
|
||||||
final String value,
|
final String value,
|
||||||
final String classid,
|
final String classid,
|
||||||
final String classname,
|
final String classname,
|
||||||
final String schemeid) {
|
final String schemeid) {
|
||||||
|
|
||||||
return structuredProperty(value, qualifier(classid, classname, schemeid));
|
return structuredProperty(value, qualifier(classid, classname, schemeid));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Subject subject(
|
public static Subject subject(
|
||||||
final String value,
|
final String value,
|
||||||
final Qualifier qualifier,
|
final Qualifier qualifier,
|
||||||
final DataInfo dataInfo) {
|
final DataInfo dataInfo) {
|
||||||
if (value == null) {
|
if (value == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
final Subject s = new Subject();
|
final Subject s = new Subject();
|
||||||
s.setValue(value);
|
s.setValue(value);
|
||||||
s.setQualifier(qualifier);
|
s.setQualifier(qualifier);
|
||||||
s.setDataInfo(dataInfo);
|
s.setDataInfo(dataInfo);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static StructuredProperty structuredProperty(
|
public static StructuredProperty structuredProperty(
|
||||||
final String value,
|
final String value,
|
||||||
final Qualifier qualifier) {
|
final Qualifier qualifier) {
|
||||||
if (value == null) {
|
if (value == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
final StructuredProperty sp = new StructuredProperty();
|
final StructuredProperty sp = new StructuredProperty();
|
||||||
sp.setValue(value);
|
sp.setValue(value);
|
||||||
sp.setQualifier(qualifier);
|
sp.setQualifier(qualifier);
|
||||||
return sp;
|
return sp;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Publisher publisher(final String name) {
|
public static Publisher publisher(final String name) {
|
||||||
final Publisher p = new Publisher();
|
final Publisher p = new Publisher();
|
||||||
p.setName(name);
|
p.setName(name);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static License license(final String url) {
|
public static License license(final String url) {
|
||||||
final License l = new License();
|
final License l = new License();
|
||||||
l.setUrl(url);
|
l.setUrl(url);
|
||||||
return l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AuthorPid authorPid(
|
public static AuthorPid authorPid(
|
||||||
final String value,
|
final String value,
|
||||||
final Qualifier qualifier,
|
final Qualifier qualifier,
|
||||||
final DataInfo dataInfo) {
|
final DataInfo dataInfo) {
|
||||||
if (value == null) {
|
if (value == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
final AuthorPid ap = new AuthorPid();
|
final AuthorPid ap = new AuthorPid();
|
||||||
ap.setValue(value);
|
ap.setValue(value);
|
||||||
ap.setQualifier(qualifier);
|
ap.setQualifier(qualifier);
|
||||||
ap.setDataInfo(dataInfo);
|
ap.setDataInfo(dataInfo);
|
||||||
return ap;
|
return ap;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AuthorPid authorPid(
|
public static AuthorPid authorPid(
|
||||||
final String value,
|
final String value,
|
||||||
final String classid,
|
final String classid,
|
||||||
final String schemeid,
|
final String schemeid,
|
||||||
final DataInfo dataInfo) {
|
final DataInfo dataInfo) {
|
||||||
if (value == null) {
|
if (value == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
final AuthorPid ap = new AuthorPid();
|
final AuthorPid ap = new AuthorPid();
|
||||||
ap.setValue(value);
|
ap.setValue(value);
|
||||||
ap.setQualifier(qualifier(classid, classid, schemeid));
|
ap.setQualifier(qualifier(classid, classid, schemeid));
|
||||||
ap.setDataInfo(dataInfo);
|
ap.setDataInfo(dataInfo);
|
||||||
return ap;
|
return ap;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static ExtraInfo extraInfo(
|
public static ExtraInfo extraInfo(
|
||||||
final String name,
|
final String name,
|
||||||
final String value,
|
final String value,
|
||||||
final String typology,
|
final String typology,
|
||||||
final String provenance,
|
final String provenance,
|
||||||
final String trust) {
|
final String trust) {
|
||||||
final ExtraInfo info = new ExtraInfo();
|
final ExtraInfo info = new ExtraInfo();
|
||||||
info.setName(name);
|
info.setName(name);
|
||||||
info.setValue(value);
|
info.setValue(value);
|
||||||
info.setTypology(typology);
|
info.setTypology(typology);
|
||||||
info.setProvenance(provenance);
|
info.setProvenance(provenance);
|
||||||
info.setTrust(trust);
|
info.setTrust(trust);
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static OAIProvenance oaiIProvenance(
|
public static OAIProvenance oaiIProvenance(
|
||||||
final String identifier,
|
final String identifier,
|
||||||
final String baseURL,
|
final String baseURL,
|
||||||
final String metadataNamespace,
|
final String metadataNamespace,
|
||||||
final Boolean altered,
|
final Boolean altered,
|
||||||
final String datestamp,
|
final String datestamp,
|
||||||
final String harvestDate) {
|
final String harvestDate) {
|
||||||
|
|
||||||
final OriginDescription desc = new OriginDescription();
|
final OriginDescription desc = new OriginDescription();
|
||||||
desc.setIdentifier(identifier);
|
desc.setIdentifier(identifier);
|
||||||
desc.setBaseURL(baseURL);
|
desc.setBaseURL(baseURL);
|
||||||
desc.setMetadataNamespace(metadataNamespace);
|
desc.setMetadataNamespace(metadataNamespace);
|
||||||
desc.setAltered(altered);
|
desc.setAltered(altered);
|
||||||
desc.setDatestamp(datestamp);
|
desc.setDatestamp(datestamp);
|
||||||
desc.setHarvestDate(harvestDate);
|
desc.setHarvestDate(harvestDate);
|
||||||
|
|
||||||
final OAIProvenance p = new OAIProvenance();
|
final OAIProvenance p = new OAIProvenance();
|
||||||
p.setOriginDescription(desc);
|
p.setOriginDescription(desc);
|
||||||
|
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Journal journal(
|
public static Journal journal(
|
||||||
final String name,
|
final String name,
|
||||||
final String issnPrinted,
|
final String issnPrinted,
|
||||||
final String issnOnline,
|
final String issnOnline,
|
||||||
final String issnLinking) {
|
final String issnLinking) {
|
||||||
|
|
||||||
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
|
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
|
||||||
name,
|
name,
|
||||||
issnPrinted,
|
issnPrinted,
|
||||||
issnOnline,
|
issnOnline,
|
||||||
issnLinking,
|
issnLinking,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
null) : null;
|
null) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Journal journal(
|
public static Journal journal(
|
||||||
final String name,
|
final String name,
|
||||||
final String issnPrinted,
|
final String issnPrinted,
|
||||||
final String issnOnline,
|
final String issnOnline,
|
||||||
final String issnLinking,
|
final String issnLinking,
|
||||||
final String ep,
|
final String ep,
|
||||||
final String iss,
|
final String iss,
|
||||||
final String sp,
|
final String sp,
|
||||||
final String vol,
|
final String vol,
|
||||||
final String edition,
|
final String edition,
|
||||||
final String conferenceplace,
|
final String conferenceplace,
|
||||||
final String conferencedate) {
|
final String conferencedate) {
|
||||||
|
|
||||||
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
|
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
|
||||||
final Journal j = new Journal();
|
final Journal j = new Journal();
|
||||||
j.setName(name);
|
j.setName(name);
|
||||||
j.setIssnPrinted(issnPrinted);
|
j.setIssnPrinted(issnPrinted);
|
||||||
j.setIssnOnline(issnOnline);
|
j.setIssnOnline(issnOnline);
|
||||||
j.setIssnLinking(issnLinking);
|
j.setIssnLinking(issnLinking);
|
||||||
j.setEp(ep);
|
j.setEp(ep);
|
||||||
j.setIss(iss);
|
j.setIss(iss);
|
||||||
j.setSp(sp);
|
j.setSp(sp);
|
||||||
j.setVol(vol);
|
j.setVol(vol);
|
||||||
j.setEdition(edition);
|
j.setEdition(edition);
|
||||||
j.setConferenceplace(conferenceplace);
|
j.setConferenceplace(conferenceplace);
|
||||||
j.setConferencedate(conferencedate);
|
j.setConferencedate(conferencedate);
|
||||||
return j;
|
return j;
|
||||||
} else {
|
} else {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
|
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
|
||||||
return StringUtils.isNotBlank(issnPrinted)
|
return StringUtils.isNotBlank(issnPrinted)
|
||||||
|| StringUtils.isNotBlank(issnOnline)
|
|| StringUtils.isNotBlank(issnOnline)
|
||||||
|| StringUtils.isNotBlank(issnLinking);
|
|| StringUtils.isNotBlank(issnLinking);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static DataInfo dataInfo(
|
public static DataInfo dataInfo(
|
||||||
final float trust,
|
final float trust,
|
||||||
final String inferenceprovenance,
|
final String inferenceprovenance,
|
||||||
final boolean inferred,
|
final boolean inferred,
|
||||||
final Qualifier provenanceaction) {
|
final Qualifier provenanceaction) {
|
||||||
final DataInfo d = new DataInfo();
|
final DataInfo d = new DataInfo();
|
||||||
d.setTrust(trust);
|
d.setTrust(trust);
|
||||||
d.setInferenceprovenance(inferenceprovenance);
|
d.setInferenceprovenance(inferenceprovenance);
|
||||||
d.setInferred(inferred);
|
d.setInferred(inferred);
|
||||||
d.setProvenanceaction(provenanceaction);
|
d.setProvenanceaction(provenanceaction);
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static EntityDataInfo dataInfo(
|
public static EntityDataInfo dataInfo(
|
||||||
final boolean invisible,
|
final boolean invisible,
|
||||||
final boolean deletedbyinference,
|
final boolean deletedbyinference,
|
||||||
final float trust,
|
final float trust,
|
||||||
final String inferenceprovenance,
|
final String inferenceprovenance,
|
||||||
final boolean inferred,
|
final boolean inferred,
|
||||||
final Qualifier provenanceaction) {
|
final Qualifier provenanceaction) {
|
||||||
final EntityDataInfo d = new EntityDataInfo();
|
final EntityDataInfo d = new EntityDataInfo();
|
||||||
d.setTrust(trust);
|
d.setTrust(trust);
|
||||||
d.setInvisible(invisible);
|
d.setInvisible(invisible);
|
||||||
d.setDeletedbyinference(deletedbyinference);
|
d.setDeletedbyinference(deletedbyinference);
|
||||||
d.setInferenceprovenance(inferenceprovenance);
|
d.setInferenceprovenance(inferenceprovenance);
|
||||||
d.setInferred(inferred);
|
d.setInferred(inferred);
|
||||||
d.setProvenanceaction(provenanceaction);
|
d.setProvenanceaction(provenanceaction);
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String asString(final Object o) {
|
public static String asString(final Object o) {
|
||||||
return o == null ? "" : o.toString();
|
return o == null ? "" : o.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T> Predicate<T> distinctByKey(
|
public static <T> Predicate<T> distinctByKey(
|
||||||
final Function<? super T, ?> keyExtractor) {
|
final Function<? super T, ?> keyExtractor) {
|
||||||
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
|
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
|
||||||
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
|
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
||||||
return getBestAccessRights(instanceList);
|
return getBestAccessRights(instanceList);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
||||||
if (instanceList != null) {
|
if (instanceList != null) {
|
||||||
final Optional<AccessRight> min = instanceList
|
final Optional<AccessRight> min = instanceList
|
||||||
.stream()
|
.stream()
|
||||||
.map(Instance::getAccessright)
|
.map(Instance::getAccessright)
|
||||||
.min(new AccessRightComparator<>());
|
.min(new AccessRightComparator<>());
|
||||||
|
|
||||||
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
|
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
|
||||||
|
|
||||||
if (StringUtils.isBlank(rights.getClassid())) {
|
if (StringUtils.isBlank(rights.getClassid())) {
|
||||||
rights.setClassid(UNKNOWN);
|
rights.setClassid(UNKNOWN);
|
||||||
}
|
}
|
||||||
if (StringUtils.isBlank(rights.getClassname())
|
if (StringUtils.isBlank(rights.getClassname())
|
||||||
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||||
rights.setClassname(NOT_AVAILABLE);
|
rights.setClassname(NOT_AVAILABLE);
|
||||||
}
|
}
|
||||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||||
}
|
}
|
||||||
|
|
||||||
return rights;
|
return rights;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
|
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
|
||||||
Measure m = new Measure();
|
Measure m = new Measure();
|
||||||
m.setId(id);
|
m.setId(id);
|
||||||
m.setUnit(Arrays.asList(unit(key, value, dataInfo)));
|
m.setUnit(Arrays.asList(unit(key, value, dataInfo)));
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static MeasureUnit unit(String key, String value, DataInfo dataInfo) {
|
public static MeasureUnit unit(String key, String value, DataInfo dataInfo) {
|
||||||
MeasureUnit unit = new MeasureUnit();
|
MeasureUnit unit = new MeasureUnit();
|
||||||
unit.setKey(key);
|
unit.setKey(key);
|
||||||
unit.setValue(value);
|
unit.setValue(value);
|
||||||
unit.setDataInfo(dataInfo);
|
unit.setDataInfo(dataInfo);
|
||||||
return unit;
|
return unit;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Relation getRelation(final String source,
|
public static Relation getRelation(final String source,
|
||||||
final String target,
|
final String target,
|
||||||
final String relType,
|
final Relation.RELTYPE relType,
|
||||||
final String subRelType,
|
final Relation.SUBRELTYPE subRelType,
|
||||||
final String relClass,
|
final Relation.RELCLASS relClass,
|
||||||
final Entity entity) {
|
final Entity entity) {
|
||||||
return getRelation(source, target, relType, subRelType, relClass, entity, null);
|
return getRelation(source, target, relType, subRelType, relClass, entity, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Relation getRelation(final String source,
|
public static Relation getRelation(final String source,
|
||||||
final String target,
|
final String target,
|
||||||
final String relType,
|
final Relation.RELTYPE relType,
|
||||||
final String subRelType,
|
final Relation.SUBRELTYPE subRelType,
|
||||||
final String relClass,
|
final Relation.RELCLASS relClass,
|
||||||
final Entity entity,
|
final Entity entity,
|
||||||
final String validationDate) {
|
final String validationDate) {
|
||||||
|
|
||||||
final List<Provenance> provenance = getProvenance(
|
final List<Provenance> provenance = getProvenance(
|
||||||
entity.getCollectedfrom(), fromEntityDataInfo(entity.getDataInfo()));
|
entity.getCollectedfrom(), fromEntityDataInfo(entity.getDataInfo()));
|
||||||
return getRelation(
|
return getRelation(
|
||||||
source, target, relType, subRelType, relClass, provenance, validationDate, null);
|
source, target, relType, subRelType, relClass, provenance, validationDate, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Relation getRelation(final String source,
|
public static Relation getRelation(final String source,
|
||||||
final String target,
|
final String target,
|
||||||
final String relType,
|
final Relation.RELTYPE relType,
|
||||||
final String subRelType,
|
final Relation.SUBRELTYPE subRelType,
|
||||||
final String relClass,
|
final Relation.RELCLASS relClass,
|
||||||
final List<Provenance> provenance) {
|
final List<Provenance> provenance) {
|
||||||
return getRelation(
|
return getRelation(
|
||||||
source, target, relType, subRelType, relClass, provenance, null, null);
|
source, target, relType, subRelType, relClass, provenance, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Relation getRelation(final String source,
|
public static Relation getRelation(final String source,
|
||||||
final String target,
|
final String target,
|
||||||
final String relType,
|
final Relation.RELTYPE relType,
|
||||||
final String subRelType,
|
final Relation.SUBRELTYPE subRelType,
|
||||||
final String relClass,
|
final Relation.RELCLASS relClass,
|
||||||
final List<Provenance> provenance,
|
final List<Provenance> provenance,
|
||||||
final List<KeyValue> properties) {
|
final List<KeyValue> properties) {
|
||||||
return getRelation(
|
return getRelation(
|
||||||
source, target, relType, subRelType, relClass, provenance, null, properties);
|
source, target, relType, subRelType, relClass, provenance, null, properties);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Relation getRelation(final String source,
|
public static Relation getRelation(final String source,
|
||||||
final String target,
|
final String target,
|
||||||
final String relType,
|
final Relation.RELTYPE relType,
|
||||||
final String subRelType,
|
final Relation.SUBRELTYPE subRelType,
|
||||||
final String relClass,
|
final Relation.RELCLASS relClass,
|
||||||
final List<Provenance> provenance,
|
final List<Provenance> provenance,
|
||||||
final String validationDate,
|
final String validationDate,
|
||||||
final List<KeyValue> properties) {
|
final List<KeyValue> properties) {
|
||||||
final Relation rel = new Relation();
|
final Relation rel = new Relation();
|
||||||
rel.setRelType(relType);
|
rel.setRelType(relType);
|
||||||
rel.setSubRelType(subRelType);
|
rel.setSubRelType(subRelType);
|
||||||
rel.setRelClass(relClass);
|
rel.setRelClass(relClass);
|
||||||
rel.setSource(source);
|
rel.setSource(source);
|
||||||
rel.setTarget(target);
|
rel.setTarget(target);
|
||||||
rel.setProvenance(provenance);
|
rel.setProvenance(provenance);
|
||||||
rel.setValidated(StringUtils.isNotBlank(validationDate));
|
rel.setValidated(StringUtils.isNotBlank(validationDate));
|
||||||
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
|
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
|
||||||
rel.setProperties(properties);
|
rel.setProperties(properties);
|
||||||
return rel;
|
return rel;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Provenance> getProvenance(final List<KeyValue> collectedfrom, final DataInfo dataInfo) {
|
public static List<Provenance> getProvenance(final List<KeyValue> collectedfrom, final DataInfo dataInfo) {
|
||||||
return collectedfrom
|
return collectedfrom
|
||||||
.stream()
|
.stream()
|
||||||
.map(cf -> getProvenance(cf, dataInfo))
|
.map(cf -> getProvenance(cf, dataInfo))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Provenance getProvenance(final KeyValue collectedfrom, final DataInfo dataInfo) {
|
public static Provenance getProvenance(final KeyValue collectedfrom, final DataInfo dataInfo) {
|
||||||
final Provenance prov = new Provenance();
|
final Provenance prov = new Provenance();
|
||||||
prov.setCollectedfrom(collectedfrom);
|
prov.setCollectedfrom(collectedfrom);
|
||||||
prov.setDataInfo(dataInfo);
|
prov.setDataInfo(dataInfo);
|
||||||
return prov;
|
return prov;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String getProvenance(DataInfo dataInfo) {
|
public static String getProvenance(DataInfo dataInfo) {
|
||||||
return Optional
|
return Optional
|
||||||
.ofNullable(dataInfo)
|
.ofNullable(dataInfo)
|
||||||
.map(
|
.map(
|
||||||
d -> Optional
|
d -> Optional
|
||||||
.ofNullable(d.getProvenanceaction())
|
.ofNullable(d.getProvenanceaction())
|
||||||
.map(Qualifier::getClassid)
|
.map(Qualifier::getClassid)
|
||||||
.orElse(""))
|
.orElse(""))
|
||||||
.orElse("");
|
.orElse("");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static DataInfo fromEntityDataInfo(EntityDataInfo entityDataInfo) {
|
public static DataInfo fromEntityDataInfo(EntityDataInfo entityDataInfo) {
|
||||||
DataInfo dataInfo = new DataInfo();
|
DataInfo dataInfo = new DataInfo();
|
||||||
dataInfo.setTrust(entityDataInfo.getTrust());
|
dataInfo.setTrust(entityDataInfo.getTrust());
|
||||||
dataInfo.setInferenceprovenance(entityDataInfo.getInferenceprovenance());
|
dataInfo.setInferenceprovenance(entityDataInfo.getInferenceprovenance());
|
||||||
dataInfo.setInferred(entityDataInfo.getInferred());
|
dataInfo.setInferred(entityDataInfo.getInferred());
|
||||||
dataInfo.setProvenanceaction(entityDataInfo.getProvenanceaction());
|
dataInfo.setProvenanceaction(entityDataInfo.getProvenanceaction());
|
||||||
return dataInfo;
|
return dataInfo;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,34 +34,34 @@ public class ResultTypeComparator implements Comparator<Result> {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
String lClass = left.getResulttype();
|
Result.RESULTTYPE lType = left.getResulttype();
|
||||||
String rClass = right.getResulttype();
|
Result.RESULTTYPE rType = right.getResulttype();
|
||||||
|
|
||||||
if (lClass.equals(rClass))
|
if (lType.equals(rType))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
if (lType.equals(Result.RESULTTYPE.publication))
|
||||||
return -1;
|
return -1;
|
||||||
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
if (rType.equals(Result.RESULTTYPE.publication))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
if (lType.equals(Result.RESULTTYPE.dataset))
|
||||||
return -1;
|
return -1;
|
||||||
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
if (rType.equals(Result.RESULTTYPE.dataset))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
if (lType.equals(Result.RESULTTYPE.software))
|
||||||
return -1;
|
return -1;
|
||||||
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
if (rType.equals(Result.RESULTTYPE.software))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
if (lType.equals(Result.RESULTTYPE.otherresearchproduct))
|
||||||
return -1;
|
return -1;
|
||||||
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
if (rType.equals(Result.RESULTTYPE.otherresearchproduct))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
// Else (but unlikely), lexicographical ordering will do.
|
// Else (but unlikely), lexicographical ordering will do.
|
||||||
return lClass.compareTo(rClass);
|
return lType.compareTo(rType);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected HashSet<String> getCollectedFromIds(Result left) {
|
protected HashSet<String> getCollectedFromIds(Result left) {
|
||||||
|
|
|
@ -1,158 +0,0 @@
|
||||||
{
|
|
||||||
"cites":{
|
|
||||||
"original":"Cites",
|
|
||||||
"inverse":"IsCitedBy"
|
|
||||||
},
|
|
||||||
"compiles":{
|
|
||||||
"original":"Compiles",
|
|
||||||
"inverse":"IsCompiledBy"
|
|
||||||
},
|
|
||||||
"continues":{
|
|
||||||
"original":"Continues",
|
|
||||||
"inverse":"IsContinuedBy"
|
|
||||||
},
|
|
||||||
"derives":{
|
|
||||||
"original":"IsSourceOf",
|
|
||||||
"inverse":"IsDerivedFrom"
|
|
||||||
},
|
|
||||||
"describes":{
|
|
||||||
"original":"Describes",
|
|
||||||
"inverse":"IsDescribedBy"
|
|
||||||
},
|
|
||||||
"documents":{
|
|
||||||
"original":"Documents",
|
|
||||||
"inverse":"IsDocumentedBy"
|
|
||||||
},
|
|
||||||
"hasmetadata":{
|
|
||||||
"original":"HasMetadata",
|
|
||||||
"inverse":"IsMetadataOf"
|
|
||||||
},
|
|
||||||
"hasassociationwith":{
|
|
||||||
"original":"HasAssociationWith",
|
|
||||||
"inverse":"HasAssociationWith"
|
|
||||||
},
|
|
||||||
"haspart":{
|
|
||||||
"original":"HasPart",
|
|
||||||
"inverse":"IsPartOf"
|
|
||||||
},
|
|
||||||
"hasversion":{
|
|
||||||
"original":"HasVersion",
|
|
||||||
"inverse":"IsVersionOf"
|
|
||||||
},
|
|
||||||
"iscitedby":{
|
|
||||||
"original":"IsCitedBy",
|
|
||||||
"inverse":"Cites"
|
|
||||||
},
|
|
||||||
"iscompiledby":{
|
|
||||||
"original":"IsCompiledBy",
|
|
||||||
"inverse":"Compiles"
|
|
||||||
},
|
|
||||||
"iscontinuedby":{
|
|
||||||
"original":"IsContinuedBy",
|
|
||||||
"inverse":"Continues"
|
|
||||||
},
|
|
||||||
"isderivedfrom":{
|
|
||||||
"original":"IsDerivedFrom",
|
|
||||||
"inverse":"IsSourceOf"
|
|
||||||
},
|
|
||||||
"isdescribedby":{
|
|
||||||
"original":"IsDescribedBy",
|
|
||||||
"inverse":"Describes"
|
|
||||||
},
|
|
||||||
"isdocumentedby":{
|
|
||||||
"original":"IsDocumentedBy",
|
|
||||||
"inverse":"Documents"
|
|
||||||
},
|
|
||||||
"isidenticalto":{
|
|
||||||
"original":"IsIdenticalTo",
|
|
||||||
"inverse":"IsIdenticalTo"
|
|
||||||
},
|
|
||||||
"ismetadatafor":{
|
|
||||||
"original":"IsMetadataFor",
|
|
||||||
"inverse":"IsMetadataOf"
|
|
||||||
},
|
|
||||||
"ismetadataof":{
|
|
||||||
"original":"IsMetadataOf",
|
|
||||||
"inverse":"IsMetadataFor"
|
|
||||||
},
|
|
||||||
"isnewversionof":{
|
|
||||||
"original":"IsNewVersionOf",
|
|
||||||
"inverse":"IsPreviousVersionOf"
|
|
||||||
},
|
|
||||||
"isobsoletedby":{
|
|
||||||
"original":"IsObsoletedBy",
|
|
||||||
"inverse":"Obsoletes"
|
|
||||||
},
|
|
||||||
"isoriginalformof":{
|
|
||||||
"original":"IsOriginalFormOf",
|
|
||||||
"inverse":"IsVariantFormOf"
|
|
||||||
},
|
|
||||||
"ispartof":{
|
|
||||||
"original":"IsPartOf",
|
|
||||||
"inverse":"HasPart"
|
|
||||||
},
|
|
||||||
"ispreviousversionof":{
|
|
||||||
"original":"IsPreviousVersionOf",
|
|
||||||
"inverse":"IsNewVersionOf"
|
|
||||||
},
|
|
||||||
"isreferencedby":{
|
|
||||||
"original":"IsReferencedBy",
|
|
||||||
"inverse":"References"
|
|
||||||
},
|
|
||||||
"isrelatedto":{
|
|
||||||
"original":"IsRelatedTo",
|
|
||||||
"inverse":"IsRelatedTo"
|
|
||||||
},
|
|
||||||
"isrequiredby":{
|
|
||||||
"original":"IsRequiredBy",
|
|
||||||
"inverse":"Requires"
|
|
||||||
},
|
|
||||||
"isreviewedby":{
|
|
||||||
"original":"IsReviewedBy",
|
|
||||||
"inverse":"Reviews"
|
|
||||||
},
|
|
||||||
"issourceof":{
|
|
||||||
"original":"IsSourceOf",
|
|
||||||
"inverse":"IsDerivedFrom"
|
|
||||||
},
|
|
||||||
"issupplementedby":{
|
|
||||||
"original":"IsSupplementedBy",
|
|
||||||
"inverse":"IsSupplementTo"
|
|
||||||
},
|
|
||||||
"issupplementto":{
|
|
||||||
"original":"IsSupplementTo",
|
|
||||||
"inverse":"IsSupplementedBy"
|
|
||||||
},
|
|
||||||
"isvariantformof":{
|
|
||||||
"original":"IsVariantFormOf",
|
|
||||||
"inverse":"IsOriginalFormOf"
|
|
||||||
},
|
|
||||||
"isversionof":{
|
|
||||||
"original":"IsVersionOf",
|
|
||||||
"inverse":"HasVersion"
|
|
||||||
},
|
|
||||||
"obsoletes":{
|
|
||||||
"original":"Obsoletes",
|
|
||||||
"inverse":"IsObsoletedBy"
|
|
||||||
},
|
|
||||||
"references":{
|
|
||||||
"original":"References",
|
|
||||||
"inverse":"IsReferencedBy"
|
|
||||||
},
|
|
||||||
"requires":{
|
|
||||||
"original":"Requires",
|
|
||||||
"inverse":"IsRequiredBy"
|
|
||||||
},
|
|
||||||
"related":{
|
|
||||||
"original":"IsRelatedTo",
|
|
||||||
"inverse":"IsRelatedTo"
|
|
||||||
},
|
|
||||||
"reviews":{
|
|
||||||
"original":"Reviews",
|
|
||||||
"inverse":"IsReviewedBy"
|
|
||||||
},
|
|
||||||
"unknown":{
|
|
||||||
"original":"Unknown",
|
|
||||||
"inverse":"Unknown"
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -22,19 +22,6 @@ object ScholixUtils extends Serializable {
|
||||||
|
|
||||||
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
||||||
|
|
||||||
val relations: Map[String, RelationVocabulary] = {
|
|
||||||
val input = Source
|
|
||||||
.fromInputStream(
|
|
||||||
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
|
|
||||||
)
|
|
||||||
.mkString
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
|
||||||
|
|
||||||
lazy val json: json4s.JValue = parse(input)
|
|
||||||
|
|
||||||
json.extract[Map[String, RelationVocabulary]]
|
|
||||||
}
|
|
||||||
|
|
||||||
def extractRelationDate(relation: Relation): String = {
|
def extractRelationDate(relation: Relation): String = {
|
||||||
|
|
||||||
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
||||||
|
@ -288,11 +275,8 @@ object ScholixUtils extends Serializable {
|
||||||
s.setPublisher(source.getPublisher)
|
s.setPublisher(source.getPublisher)
|
||||||
}
|
}
|
||||||
|
|
||||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
|
||||||
if (semanticRelation == null)
|
|
||||||
return null
|
|
||||||
s.setRelationship(
|
s.setRelationship(
|
||||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
new ScholixRelationship(relation.getRelClass.toString, "datacite", relation.getRelClass.getInverse.toString)
|
||||||
)
|
)
|
||||||
s.setSource(source)
|
s.setSource(source)
|
||||||
|
|
||||||
|
@ -330,12 +314,10 @@ object ScholixUtils extends Serializable {
|
||||||
s.setPublisher(l.asJava)
|
s.setPublisher(l.asJava)
|
||||||
}
|
}
|
||||||
|
|
||||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
|
||||||
if (semanticRelation == null)
|
|
||||||
return null
|
|
||||||
s.setRelationship(
|
s.setRelationship(
|
||||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
new ScholixRelationship(relation.getRelClass.toString, "datacite", relation.getRelClass.getInverse.toString)
|
||||||
)
|
)
|
||||||
|
|
||||||
s.setSource(generateScholixResourceFromSummary(source))
|
s.setSource(generateScholixResourceFromSummary(source))
|
||||||
|
|
||||||
s
|
s
|
||||||
|
|
|
@ -44,7 +44,7 @@ public class MergeUtilsTest {
|
||||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||||
|
|
||||||
final Result p1d2 = MergeUtils.merge(p1, d2);
|
final Result p1d2 = MergeUtils.merge(p1, d2);
|
||||||
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype());
|
assertEquals(Result.RESULTTYPE.publication, p1d2.getResulttype());
|
||||||
assertTrue(p1d2 instanceof Publication);
|
assertTrue(p1d2 instanceof Publication);
|
||||||
assertEquals(p1.getId(), p1d2.getId());
|
assertEquals(p1.getId(), p1d2.getId());
|
||||||
}
|
}
|
||||||
|
@ -55,7 +55,7 @@ public class MergeUtilsTest {
|
||||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||||
|
|
||||||
final Result p2d1 = MergeUtils.merge(p2, d1);
|
final Result p2d1 = MergeUtils.merge(p2, d1);
|
||||||
assertEquals(ModelConstants.DATASET_RESULTTYPE_CLASSID, p2d1.getResulttype());
|
assertEquals(Result.RESULTTYPE.dataset, p2d1.getResulttype());
|
||||||
assertTrue(p2d1 instanceof Dataset);
|
assertTrue(p2d1 instanceof Dataset);
|
||||||
assertEquals(d1.getId(), p2d1.getId());
|
assertEquals(d1.getId(), p2d1.getId());
|
||||||
assertEquals(2, p2d1.getCollectedfrom().size());
|
assertEquals(2, p2d1.getCollectedfrom().size());
|
||||||
|
|
|
@ -18,30 +18,21 @@ object CollectionUtils {
|
||||||
|
|
||||||
def fixRelations(i: Oaf): List[Oaf] = {
|
def fixRelations(i: Oaf): List[Oaf] = {
|
||||||
if (i.isInstanceOf[Entity])
|
if (i.isInstanceOf[Entity])
|
||||||
return List(i)
|
List(i)
|
||||||
else {
|
else {
|
||||||
val r: Relation = i.asInstanceOf[Relation]
|
val r: Relation = i.asInstanceOf[Relation]
|
||||||
val currentRel = ModelSupport.findRelation(r.getRelClass)
|
val inverse = new Relation
|
||||||
if (currentRel != null) {
|
inverse.setSource(r.getTarget)
|
||||||
|
inverse.setTarget(r.getSource)
|
||||||
// Cleaning relation
|
inverse.setRelType(r.getRelType)
|
||||||
r.setRelType(currentRel.getRelType)
|
inverse.setSubRelType(r.getSubRelType)
|
||||||
r.setSubRelType(currentRel.getSubReltype)
|
inverse.setRelClass(r.getRelClass.getInverse)
|
||||||
r.setRelClass(currentRel.getRelClass)
|
inverse.setProvenance(r.getProvenance)
|
||||||
val inverse = new Relation
|
inverse.setProperties(r.getProperties)
|
||||||
inverse.setSource(r.getTarget)
|
inverse.setValidated(r.getValidated)
|
||||||
inverse.setTarget(r.getSource)
|
inverse.setValidationDate(r.getValidationDate)
|
||||||
inverse.setRelType(currentRel.getRelType)
|
List(r, inverse)
|
||||||
inverse.setSubRelType(currentRel.getSubReltype)
|
|
||||||
inverse.setRelClass(currentRel.getInverseRelClass)
|
|
||||||
inverse.setProvenance(r.getProvenance)
|
|
||||||
inverse.setProperties(r.getProperties)
|
|
||||||
inverse.setValidated(r.getValidated)
|
|
||||||
inverse.setValidationDate(r.getValidationDate)
|
|
||||||
return List(r, inverse)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
List()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {
|
def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {
|
||||||
|
|
|
@ -61,13 +61,15 @@ object CrossrefUtility {
|
||||||
resultList
|
resultList
|
||||||
}
|
}
|
||||||
|
|
||||||
private def createRelation(sourceId: String, targetId: String, relClass: String): Relation = {
|
private def createRelation(sourceId: String, targetId: String, relClass: Relation.RELCLASS): Relation = {
|
||||||
val r = new Relation
|
val r = new Relation
|
||||||
|
|
||||||
|
//TODO further inspect
|
||||||
r.setSource(sourceId)
|
r.setSource(sourceId)
|
||||||
r.setTarget(targetId)
|
r.setTarget(targetId)
|
||||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
r.setRelType(Relation.RELTYPE.resultProject)
|
||||||
r.setRelClass(relClass)
|
r.setRelClass(relClass)
|
||||||
r.setSubRelType(ModelConstants.OUTCOME)
|
r.setSubRelType(Relation.SUBRELTYPE.outcome)
|
||||||
r.setProvenance(List(OafMapperUtils.getProvenance(CROSSREF_COLLECTED_FROM, null)).asJava)
|
r.setProvenance(List(OafMapperUtils.getProvenance(CROSSREF_COLLECTED_FROM, null)).asJava)
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
|
@ -84,7 +86,7 @@ object CrossrefUtility {
|
||||||
.filter(a => a != null && a.nonEmpty)
|
.filter(a => a != null && a.nonEmpty)
|
||||||
.map(award => {
|
.map(award => {
|
||||||
val targetId = IdentifierFactory.createOpenaireId("project", s"$nsPrefix::$award", true)
|
val targetId = IdentifierFactory.createOpenaireId("project", s"$nsPrefix::$award", true)
|
||||||
createRelation(targetId, source.getId, ModelConstants.PRODUCES)
|
createRelation(targetId, source.getId, Relation.RELCLASS.produces)
|
||||||
})
|
})
|
||||||
else List()
|
else List()
|
||||||
}
|
}
|
||||||
|
@ -132,15 +134,15 @@ object CrossrefUtility {
|
||||||
case "10.13039/501100000038" =>
|
case "10.13039/501100000038" =>
|
||||||
val targetId =
|
val targetId =
|
||||||
IdentifierFactory.createOpenaireId("project", "nserc_______::1e5e62235d094afd01cd56e65112fc63", false)
|
IdentifierFactory.createOpenaireId("project", "nserc_______::1e5e62235d094afd01cd56e65112fc63", false)
|
||||||
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
|
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
|
||||||
case "10.13039/501100000155" =>
|
case "10.13039/501100000155" =>
|
||||||
val targetId =
|
val targetId =
|
||||||
IdentifierFactory.createOpenaireId("project", "sshrc_______::1e5e62235d094afd01cd56e65112fc63", false)
|
IdentifierFactory.createOpenaireId("project", "sshrc_______::1e5e62235d094afd01cd56e65112fc63", false)
|
||||||
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
|
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
|
||||||
case "10.13039/501100000024" =>
|
case "10.13039/501100000024" =>
|
||||||
val targetId =
|
val targetId =
|
||||||
IdentifierFactory.createOpenaireId("project", "cihr________::1e5e62235d094afd01cd56e65112fc63", false)
|
IdentifierFactory.createOpenaireId("project", "cihr________::1e5e62235d094afd01cd56e65112fc63", false)
|
||||||
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
|
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
|
||||||
case "10.13039/501100002848" =>
|
case "10.13039/501100002848" =>
|
||||||
relList = relList ::: generateSimpleRelationFromAward(funder, "conicytf____", a => a, result)
|
relList = relList ::: generateSimpleRelationFromAward(funder, "conicytf____", a => a, result)
|
||||||
case "10.13039/501100003448" =>
|
case "10.13039/501100003448" =>
|
||||||
|
@ -153,7 +155,7 @@ object CrossrefUtility {
|
||||||
relList = relList ::: generateSimpleRelationFromAward(funder, "miur________", a => a, result)
|
relList = relList ::: generateSimpleRelationFromAward(funder, "miur________", a => a, result)
|
||||||
val targetId =
|
val targetId =
|
||||||
IdentifierFactory.createOpenaireId("project", "miur________::1e5e62235d094afd01cd56e65112fc63", false)
|
IdentifierFactory.createOpenaireId("project", "miur________::1e5e62235d094afd01cd56e65112fc63", false)
|
||||||
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
|
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
|
||||||
case "10.13039/501100006588" | "10.13039/501100004488" =>
|
case "10.13039/501100006588" | "10.13039/501100004488" =>
|
||||||
relList = relList ::: generateSimpleRelationFromAward(
|
relList = relList ::: generateSimpleRelationFromAward(
|
||||||
funder,
|
funder,
|
||||||
|
@ -171,7 +173,7 @@ object CrossrefUtility {
|
||||||
relList = relList ::: generateSimpleRelationFromAward(funder, "wt__________", a => a, result)
|
relList = relList ::: generateSimpleRelationFromAward(funder, "wt__________", a => a, result)
|
||||||
val targetId =
|
val targetId =
|
||||||
IdentifierFactory.createOpenaireId("project", "wt__________::1e5e62235d094afd01cd56e65112fc63", false)
|
IdentifierFactory.createOpenaireId("project", "wt__________::1e5e62235d094afd01cd56e65112fc63", false)
|
||||||
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
|
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
|
||||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -191,7 +193,7 @@ object CrossrefUtility {
|
||||||
relList = relList ::: generateSimpleRelationFromAward(funder, "wt__________", a => a, result)
|
relList = relList ::: generateSimpleRelationFromAward(funder, "wt__________", a => a, result)
|
||||||
val targetId =
|
val targetId =
|
||||||
IdentifierFactory.createOpenaireId("project", "wt__________::1e5e62235d094afd01cd56e65112fc63", false)
|
IdentifierFactory.createOpenaireId("project", "wt__________::1e5e62235d094afd01cd56e65112fc63", false)
|
||||||
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
|
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
|
||||||
case _ => logger.debug("no match for " + funder.name)
|
case _ => logger.debug("no match for " + funder.name)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -78,124 +78,6 @@ object DataciteModelConstants {
|
||||||
val DATACITE_COLLECTED_FROM: KeyValue =
|
val DATACITE_COLLECTED_FROM: KeyValue =
|
||||||
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
|
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
|
||||||
|
|
||||||
val subRelTypeMapping: Map[String, OAFRelations] = Map(
|
|
||||||
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_SUPPLEMENTED_BY,
|
|
||||||
ModelConstants.IS_SUPPLEMENT_TO,
|
|
||||||
ModelConstants.SUPPLEMENT
|
|
||||||
),
|
|
||||||
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
|
|
||||||
ModelConstants.IS_SUPPLEMENT_TO,
|
|
||||||
ModelConstants.IS_SUPPLEMENTED_BY,
|
|
||||||
ModelConstants.SUPPLEMENT
|
|
||||||
),
|
|
||||||
ModelConstants.HAS_PART -> OAFRelations(
|
|
||||||
ModelConstants.HAS_PART,
|
|
||||||
ModelConstants.IS_PART_OF,
|
|
||||||
ModelConstants.PART
|
|
||||||
),
|
|
||||||
ModelConstants.IS_PART_OF -> OAFRelations(
|
|
||||||
ModelConstants.IS_PART_OF,
|
|
||||||
ModelConstants.HAS_PART,
|
|
||||||
ModelConstants.PART
|
|
||||||
),
|
|
||||||
ModelConstants.IS_VERSION_OF -> OAFRelations(
|
|
||||||
ModelConstants.IS_VERSION_OF,
|
|
||||||
ModelConstants.HAS_VERSION,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.HAS_VERSION -> OAFRelations(
|
|
||||||
ModelConstants.HAS_VERSION,
|
|
||||||
ModelConstants.IS_VERSION_OF,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
|
|
||||||
ModelConstants.IS_IDENTICAL_TO,
|
|
||||||
ModelConstants.IS_IDENTICAL_TO,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_CONTINUED_BY,
|
|
||||||
ModelConstants.CONTINUES,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.CONTINUES -> OAFRelations(
|
|
||||||
ModelConstants.CONTINUES,
|
|
||||||
ModelConstants.IS_CONTINUED_BY,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
|
|
||||||
ModelConstants.IS_NEW_VERSION_OF,
|
|
||||||
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
|
|
||||||
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
|
||||||
ModelConstants.IS_NEW_VERSION_OF,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_DOCUMENTED_BY,
|
|
||||||
ModelConstants.DOCUMENTS,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.DOCUMENTS -> OAFRelations(
|
|
||||||
ModelConstants.DOCUMENTS,
|
|
||||||
ModelConstants.IS_DOCUMENTED_BY,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.IS_SOURCE_OF -> OAFRelations(
|
|
||||||
ModelConstants.IS_SOURCE_OF,
|
|
||||||
ModelConstants.IS_DERIVED_FROM,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
|
|
||||||
ModelConstants.IS_DERIVED_FROM,
|
|
||||||
ModelConstants.IS_SOURCE_OF,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
|
|
||||||
ModelConstants.IS_VARIANT_FORM_OF,
|
|
||||||
ModelConstants.IS_DERIVED_FROM,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_OBSOLETED_BY,
|
|
||||||
ModelConstants.IS_NEW_VERSION_OF,
|
|
||||||
ModelConstants.VERSION
|
|
||||||
),
|
|
||||||
ModelConstants.REVIEWS -> OAFRelations(
|
|
||||||
ModelConstants.REVIEWS,
|
|
||||||
ModelConstants.IS_REVIEWED_BY,
|
|
||||||
ModelConstants.REVIEW
|
|
||||||
),
|
|
||||||
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_REVIEWED_BY,
|
|
||||||
ModelConstants.REVIEWS,
|
|
||||||
ModelConstants.REVIEW
|
|
||||||
),
|
|
||||||
ModelConstants.DOCUMENTS -> OAFRelations(
|
|
||||||
ModelConstants.DOCUMENTS,
|
|
||||||
ModelConstants.IS_DOCUMENTED_BY,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_DOCUMENTED_BY,
|
|
||||||
ModelConstants.DOCUMENTS,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.COMPILES -> OAFRelations(
|
|
||||||
ModelConstants.COMPILES,
|
|
||||||
ModelConstants.IS_COMPILED_BY,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
),
|
|
||||||
ModelConstants.IS_COMPILED_BY -> OAFRelations(
|
|
||||||
ModelConstants.IS_COMPILED_BY,
|
|
||||||
ModelConstants.COMPILES,
|
|
||||||
ModelConstants.RELATIONSHIP
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
val datacite_filter: List[String] = {
|
val datacite_filter: List[String] = {
|
||||||
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
|
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
|
||||||
require(stream != null)
|
require(stream != null)
|
||||||
|
|
|
@ -286,7 +286,7 @@ object DataciteToOAFTransformation {
|
||||||
def generateRelation(
|
def generateRelation(
|
||||||
sourceId: String,
|
sourceId: String,
|
||||||
targetId: String,
|
targetId: String,
|
||||||
relClass: String,
|
relClass: Relation.RELCLASS,
|
||||||
collectedFrom: KeyValue,
|
collectedFrom: KeyValue,
|
||||||
di: DataInfo
|
di: DataInfo
|
||||||
): Relation = {
|
): Relation = {
|
||||||
|
@ -294,9 +294,9 @@ object DataciteToOAFTransformation {
|
||||||
val r = new Relation
|
val r = new Relation
|
||||||
r.setSource(sourceId)
|
r.setSource(sourceId)
|
||||||
r.setTarget(targetId)
|
r.setTarget(targetId)
|
||||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
r.setRelType(Relation.RELTYPE.resultProject)
|
||||||
r.setRelClass(relClass)
|
r.setRelClass(relClass)
|
||||||
r.setSubRelType(ModelConstants.OUTCOME)
|
r.setSubRelType(Relation.SUBRELTYPE.outcome)
|
||||||
r.setProvenance(Lists.newArrayList(OafMapperUtils.getProvenance(collectedFrom, di)))
|
r.setProvenance(Lists.newArrayList(OafMapperUtils.getProvenance(collectedFrom, di)))
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
|
@ -309,7 +309,7 @@ object DataciteToOAFTransformation {
|
||||||
val p = match_pattern.get._2
|
val p = match_pattern.get._2
|
||||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||||
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, relDataInfo))
|
List(generateRelation(sourceId, targetId, Relation.RELCLASS.isProducedBy, DATACITE_COLLECTED_FROM, relDataInfo))
|
||||||
} else
|
} else
|
||||||
List()
|
List()
|
||||||
|
|
||||||
|
@ -622,8 +622,7 @@ object DataciteToOAFTransformation {
|
||||||
): List[Relation] = {
|
): List[Relation] = {
|
||||||
val bidirectionalRels: List[Relation] = rels
|
val bidirectionalRels: List[Relation] = rels
|
||||||
.filter(r =>
|
.filter(r =>
|
||||||
subRelTypeMapping
|
Relation.RELCLASS.exists(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||||
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
|
||||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,122 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class CleanContextSparkJob implements Serializable {
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CleanContextSparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json"));
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
String workingDir = parser.get("workingDir");
|
|
||||||
log.info("workingDir: {}", workingDir);
|
|
||||||
|
|
||||||
String contextId = parser.get("contextId");
|
|
||||||
log.info("contextId: {}", contextId);
|
|
||||||
|
|
||||||
String verifyParam = parser.get("verifyParam");
|
|
||||||
log.info("verifyParam: {}", verifyParam);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
|
||||||
|
|
||||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
|
|
||||||
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingDir);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
|
|
||||||
String inputPath, Class<T> entityClazz, String workingDir) {
|
|
||||||
Dataset<T> res = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz));
|
|
||||||
|
|
||||||
res.map((MapFunction<T, T>) r -> {
|
|
||||||
if (!r
|
|
||||||
.getTitle()
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
t -> t
|
|
||||||
.getQualifier()
|
|
||||||
.getClassid()
|
|
||||||
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
|
|
||||||
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))) {
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
r
|
|
||||||
.setContext(
|
|
||||||
r
|
|
||||||
.getContext()
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
c -> !c.getId().split("::")[0]
|
|
||||||
.equalsIgnoreCase(contextId))
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
return r;
|
|
||||||
}, Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir);
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(workingDir)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(inputPath);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -3,12 +3,20 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.*;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import com.google.common.collect.Iterators;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.common.ModelSupport;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
@ -20,12 +28,16 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Entity;
|
import eu.dnetlib.dhp.schema.oaf.Entity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class CleanGraphSparkJob {
|
public class CleanGraphSparkJob {
|
||||||
|
|
||||||
|
@ -33,79 +45,293 @@ public class CleanGraphSparkJob {
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
private ArgumentApplicationParser parser;
|
||||||
|
|
||||||
|
public CleanGraphSparkJob(ArgumentApplicationParser parser) {
|
||||||
|
this.parser = parser;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
String jsonConfiguration = IOUtils
|
String jsonConfiguration = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
CleanGraphSparkJob.class
|
CleanGraphSparkJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
Boolean isSparkSessionManaged = Optional
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
.map(Boolean::valueOf)
|
.map(Boolean::valueOf)
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
String isLookupUrl = parser.get("isLookupUrl");
|
||||||
|
log.info("isLookupUrl: {}", isLookupUrl);
|
||||||
|
|
||||||
|
ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(parser).run(isSparkSessionManaged, isLookup);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run(Boolean isSparkSessionManaged, ISLookUpService isLookUpService)
|
||||||
|
throws ISLookUpException, ClassNotFoundException {
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
String inputPath = parser.get("inputPath");
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
String outputPath = parser.get("outputPath");
|
String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
String isLookupUrl = parser.get("isLookupUrl");
|
|
||||||
log.info("isLookupUrl: {}", isLookupUrl);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
String graphTableClassName = parser.get("graphTableClassName");
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
log.info("graphTableClassName: {}", graphTableClassName);
|
||||||
|
|
||||||
|
String contextId = parser.get("contextId");
|
||||||
|
log.info("contextId: {}", contextId);
|
||||||
|
|
||||||
|
String verifyParam = parser.get("verifyParam");
|
||||||
|
log.info("verifyParam: {}", verifyParam);
|
||||||
|
|
||||||
|
String datasourcePath = parser.get("hostedBy");
|
||||||
|
log.info("datasourcePath: {}", datasourcePath);
|
||||||
|
|
||||||
|
String country = parser.get("country");
|
||||||
|
log.info("country: {}", country);
|
||||||
|
|
||||||
|
String[] verifyCountryParam = Optional
|
||||||
|
.ofNullable(parser.get("verifyCountryParam"))
|
||||||
|
.map(s -> s.split(";"))
|
||||||
|
.orElse(new String[] {});
|
||||||
|
log.info("verifyCountryParam: {}", verifyCountryParam);
|
||||||
|
|
||||||
|
String collectedfrom = parser.get("collectedfrom");
|
||||||
|
log.info("collectedfrom: {}", collectedfrom);
|
||||||
|
|
||||||
|
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
|
||||||
|
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
|
||||||
|
|
||||||
|
Boolean deepClean = Optional
|
||||||
|
.ofNullable(parser.get("deepClean"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.FALSE);
|
||||||
|
log.info("deepClean: {}", deepClean);
|
||||||
|
|
||||||
Class<? extends Entity> entityClazz = (Class<? extends Entity>) Class.forName(graphTableClassName);
|
Class<? extends Entity> entityClazz = (Class<? extends Entity>) Class.forName(graphTableClassName);
|
||||||
|
|
||||||
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(CleanGraphSparkJob.class.getSimpleName() + "#" + entityClazz.getSimpleName());
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
||||||
cleanGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
|
cleanGraphTable(
|
||||||
});
|
spark, vocs, inputPath, entityClazz, outputPath, contextId, verifyParam, datasourcePath, country,
|
||||||
|
verifyCountryParam, collectedfrom, dsMasterDuplicatePath, deepClean);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends Oaf> void cleanGraphTable(
|
private static <T extends Oaf> void cleanGraphTable(
|
||||||
SparkSession spark,
|
SparkSession spark,
|
||||||
VocabularyGroup vocs,
|
VocabularyGroup vocs,
|
||||||
String inputPath,
|
String inputPath,
|
||||||
Class<T> clazz,
|
Class<T> clazz,
|
||||||
String outputPath) {
|
String outputPath, String contextId, String verifyParam, String datasourcePath, String country,
|
||||||
|
String[] verifyCountryParam, String collectedfrom, String dsMasterDuplicatePath,
|
||||||
|
Boolean deepClean) {
|
||||||
|
|
||||||
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
|
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
|
||||||
|
|
||||||
readTableFromPath(spark, inputPath, clazz)
|
final Dataset<T> cleaned_basic = readTableFromPath(spark, inputPath, clazz)
|
||||||
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
|
||||||
.filter((FilterFunction<T>) GraphCleaningFunctions::filter)
|
.filter((FilterFunction<T>) GraphCleaningFunctions::filter);
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
// read the master-duplicate tuples
|
||||||
.option("compression", "gzip")
|
Dataset<MasterDuplicate> md = spark
|
||||||
.json(outputPath);
|
.read()
|
||||||
|
.textFile(dsMasterDuplicatePath)
|
||||||
|
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
|
||||||
|
|
||||||
|
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
|
||||||
|
Dataset<IdCfHbMapping> resolved = spark
|
||||||
|
.read()
|
||||||
|
.textFile(inputPath)
|
||||||
|
.map(as(clazz), Encoders.bean(clazz))
|
||||||
|
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
|
||||||
|
|
||||||
|
if (Boolean.FALSE.equals(deepClean)) {
|
||||||
|
|
||||||
|
if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
|
||||||
|
save(fixCFHB(clazz, cleaned_basic, md, resolved), outputPath);
|
||||||
|
} else {
|
||||||
|
save(cleaned_basic, outputPath);
|
||||||
|
}
|
||||||
|
} else if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
|
||||||
|
|
||||||
|
// load the hostedby mapping
|
||||||
|
Set<String> hostedBy = Sets
|
||||||
|
.newHashSet(
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(datasourcePath)
|
||||||
|
.collectAsList());
|
||||||
|
|
||||||
|
// perform the deep cleaning steps
|
||||||
|
final Dataset<T> cleaned_deep = fixCFHB(clazz, cleaned_basic, md, resolved)
|
||||||
|
.map(
|
||||||
|
(MapFunction<T, T>) value -> GraphCleaningFunctions.cleanContext(value, contextId, verifyParam),
|
||||||
|
Encoders.bean(clazz))
|
||||||
|
.map(
|
||||||
|
(MapFunction<T, T>) value -> GraphCleaningFunctions
|
||||||
|
.cleanCountry(value, verifyCountryParam, hostedBy, collectedfrom, country),
|
||||||
|
Encoders.bean(clazz));
|
||||||
|
|
||||||
|
save(cleaned_deep, outputPath);
|
||||||
|
} else {
|
||||||
|
save(cleaned_basic, outputPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> void save(final Dataset<T> dataset, final String outputPath) {
|
||||||
|
dataset
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> Dataset<T> fixCFHB(Class<T> clazz, Dataset<T> results, Dataset<MasterDuplicate> md,
|
||||||
|
Dataset<IdCfHbMapping> resolved) {
|
||||||
|
|
||||||
|
// set the EMPTY master ID/NAME
|
||||||
|
Dataset<IdCfHbMapping> resolvedDs = resolved
|
||||||
|
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
|
||||||
|
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
|
||||||
|
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()));
|
||||||
|
|
||||||
|
return results
|
||||||
|
.joinWith(resolvedDs, results.col("id").equalTo(resolvedDs.col("resultId")), "left")
|
||||||
|
.groupByKey(
|
||||||
|
(MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> ((Result) t._1()).getId(), Encoders.STRING())
|
||||||
|
.mapGroups(getMapGroupsFunction(), Encoders.bean(clazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
||||||
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
||||||
|
|
||||||
log.info("Reading Graph table from: {}", inputEntityPath);
|
log.info("Reading Graph table from: {}", inputEntityPath);
|
||||||
return spark
|
return spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(inputEntityPath)
|
.textFile(inputEntityPath)
|
||||||
.map(
|
.map(as(clazz), Encoders.bean(clazz));
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
|
}
|
||||||
Encoders.bean(clazz));
|
|
||||||
|
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
||||||
|
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
|
||||||
|
return t -> {
|
||||||
|
if (!ModelSupport.isSubClass(t, Result.class)) {
|
||||||
|
return Iterators.emptyIterator();
|
||||||
|
}
|
||||||
|
final Result r = (Result) t;
|
||||||
|
return Stream
|
||||||
|
.concat(
|
||||||
|
Optional
|
||||||
|
.ofNullable(r.getCollectedfrom())
|
||||||
|
.map(cf -> cf.stream().map(KeyValue::getKey))
|
||||||
|
.orElse(Stream.empty()),
|
||||||
|
Stream
|
||||||
|
.concat(
|
||||||
|
Optional
|
||||||
|
.ofNullable(r.getInstance())
|
||||||
|
.map(
|
||||||
|
instances -> instances
|
||||||
|
.stream()
|
||||||
|
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
|
||||||
|
.orElse(Stream.empty())
|
||||||
|
.filter(StringUtils::isNotBlank),
|
||||||
|
Optional
|
||||||
|
.ofNullable(r.getInstance())
|
||||||
|
.map(
|
||||||
|
instances -> instances
|
||||||
|
.stream()
|
||||||
|
.map(
|
||||||
|
i -> Optional
|
||||||
|
.ofNullable(i.getCollectedfrom())
|
||||||
|
.map(KeyValue::getKey)
|
||||||
|
.orElse("")))
|
||||||
|
.orElse(Stream.empty())
|
||||||
|
.filter(StringUtils::isNotBlank)))
|
||||||
|
.distinct()
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.map(cfHb -> asIdCfHbMapping(r.getId(), cfHb))
|
||||||
|
.iterator();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
|
||||||
|
return t -> {
|
||||||
|
final IdCfHbMapping mapping = t._1();
|
||||||
|
Optional
|
||||||
|
.ofNullable(t._2())
|
||||||
|
.ifPresent(t2 -> {
|
||||||
|
mapping.setMasterId(t2.getMasterId());
|
||||||
|
mapping.setMasterName(t2.getMasterName());
|
||||||
|
|
||||||
|
});
|
||||||
|
return mapping;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
|
||||||
|
IdCfHbMapping m = new IdCfHbMapping(resultId);
|
||||||
|
m.setCfhb(cfHb);
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
|
||||||
|
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
|
||||||
|
@Override
|
||||||
|
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
|
||||||
|
final Tuple2<T, IdCfHbMapping> first = values.next();
|
||||||
|
final T res = first._1();
|
||||||
|
|
||||||
|
updateResult(res, first._2());
|
||||||
|
values.forEachRemaining(t -> updateResult(res, t._2()));
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateResult(T t, IdCfHbMapping m) {
|
||||||
|
if (Objects.nonNull(m) && (ModelSupport.isSubClass(t, Result.class))) {
|
||||||
|
final Result res = (Result) t;
|
||||||
|
filter(res.getCollectedfrom()).forEach(kv -> updateKeyValue(kv, m));
|
||||||
|
res.getInstance().forEach(i -> {
|
||||||
|
updateKeyValue(i.getHostedby(), m);
|
||||||
|
updateKeyValue(i.getCollectedfrom(), m);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Stream<KeyValue> filter(List<KeyValue> kvs) {
|
||||||
|
return kvs
|
||||||
|
.stream()
|
||||||
|
.filter(kv -> StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
|
||||||
|
if (Objects.nonNull(kv) && Objects.nonNull(kv.getKey()) && kv.getKey().equals(a.getCfhb())) {
|
||||||
|
kv.setKey(a.getMasterId());
|
||||||
|
kv.setValue(a.getMasterName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.country;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -21,7 +20,6 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -1,227 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
||||||
import org.apache.spark.sql.*;
|
|
||||||
import org.apache.spark.sql.expressions.Aggregator;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
|
||||||
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
public class CleanCfHbSparkJob {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJob.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CleanCountrySparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_cfhb_parameters.json"));
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
String resolvedPath = parser.get("resolvedPath");
|
|
||||||
log.info("resolvedPath: {}", resolvedPath);
|
|
||||||
|
|
||||||
String outputPath = parser.get("outputPath");
|
|
||||||
log.info("outputPath: {}", outputPath);
|
|
||||||
|
|
||||||
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
|
|
||||||
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
|
||||||
|
|
||||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
|
||||||
HdfsSupport.remove(resolvedPath, spark.sparkContext().hadoopConfiguration());
|
|
||||||
cleanCfHb(
|
|
||||||
spark, inputPath, entityClazz, resolvedPath, dsMasterDuplicatePath, outputPath);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
|
|
||||||
String resolvedPath, String masterDuplicatePath, String outputPath) {
|
|
||||||
|
|
||||||
// read the master-duplicate tuples
|
|
||||||
Dataset<MasterDuplicate> md = spark
|
|
||||||
.read()
|
|
||||||
.textFile(masterDuplicatePath)
|
|
||||||
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
|
|
||||||
|
|
||||||
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
|
|
||||||
Dataset<IdCfHbMapping> resolved = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(as(entityClazz), Encoders.bean(entityClazz))
|
|
||||||
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
|
|
||||||
|
|
||||||
// set the EMPTY master ID/NAME and save it
|
|
||||||
resolved
|
|
||||||
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
|
|
||||||
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
|
|
||||||
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.json(resolvedPath);
|
|
||||||
|
|
||||||
// read again the resolved CF|HB mapping
|
|
||||||
Dataset<IdCfHbMapping> resolvedDS = spark
|
|
||||||
.read()
|
|
||||||
.textFile(resolvedPath)
|
|
||||||
.map(as(IdCfHbMapping.class), Encoders.bean(IdCfHbMapping.class));
|
|
||||||
|
|
||||||
// read the result table
|
|
||||||
Dataset<T> res = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(as(entityClazz), Encoders.bean(entityClazz));
|
|
||||||
|
|
||||||
// Join the results with the resolved CF|HB mapping, apply the mapping and save it
|
|
||||||
res
|
|
||||||
.joinWith(resolvedDS, res.col("id").equalTo(resolvedDS.col("resultId")), "left")
|
|
||||||
.groupByKey((MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> t._1().getId(), Encoders.STRING())
|
|
||||||
.mapGroups(getMapGroupsFunction(), Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(outputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
|
|
||||||
return t -> {
|
|
||||||
final IdCfHbMapping mapping = t._1();
|
|
||||||
Optional
|
|
||||||
.ofNullable(t._2())
|
|
||||||
.ifPresent(t2 -> {
|
|
||||||
mapping.setMasterId(t2.getMasterId());
|
|
||||||
mapping.setMasterName(t2.getMasterName());
|
|
||||||
|
|
||||||
});
|
|
||||||
return mapping;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
|
|
||||||
return r -> Stream
|
|
||||||
.concat(
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getCollectedfrom())
|
|
||||||
.map(cf -> cf.stream().map(KeyValue::getKey))
|
|
||||||
.orElse(Stream.empty()),
|
|
||||||
Stream
|
|
||||||
.concat(
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instances -> instances
|
|
||||||
.stream()
|
|
||||||
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
|
|
||||||
.orElse(Stream.empty())
|
|
||||||
.filter(StringUtils::isNotBlank),
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instances -> instances
|
|
||||||
.stream()
|
|
||||||
.map(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getCollectedfrom())
|
|
||||||
.map(KeyValue::getKey)
|
|
||||||
.orElse("")))
|
|
||||||
.orElse(Stream.empty())
|
|
||||||
.filter(StringUtils::isNotBlank)))
|
|
||||||
.distinct()
|
|
||||||
.filter(StringUtils::isNotBlank)
|
|
||||||
.map(cfHb -> asIdCfHbMapping(r.getId(), cfHb))
|
|
||||||
.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
|
|
||||||
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
|
|
||||||
@Override
|
|
||||||
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
|
|
||||||
final Tuple2<T, IdCfHbMapping> first = values.next();
|
|
||||||
final T res = first._1();
|
|
||||||
|
|
||||||
updateResult(res, first._2());
|
|
||||||
values.forEachRemaining(t -> updateResult(res, t._2()));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateResult(T res, IdCfHbMapping m) {
|
|
||||||
if (Objects.nonNull(m)) {
|
|
||||||
res.getCollectedfrom().forEach(kv -> updateKeyValue(kv, m));
|
|
||||||
res.getInstance().forEach(i -> {
|
|
||||||
updateKeyValue(i.getHostedby(), m);
|
|
||||||
updateKeyValue(i.getCollectedfrom(), m);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
|
|
||||||
if (kv.getKey().equals(a.getCfhb())) {
|
|
||||||
kv.setKey(a.getMasterId());
|
|
||||||
kv.setValue(a.getMasterName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
|
|
||||||
IdCfHbMapping m = new IdCfHbMapping(resultId);
|
|
||||||
m.setCfhb(cfHb);
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
|
||||||
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,211 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.country;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import javax.swing.text.html.Option;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author miriam.baglioni
|
|
||||||
* @Date 20/07/22
|
|
||||||
*/
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
|
||||||
|
|
||||||
public class CleanCountrySparkJob implements Serializable {
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanCountrySparkJob.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CleanCountrySparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json"));
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
String workingDir = parser.get("workingDir");
|
|
||||||
log.info("workingDir: {}", workingDir);
|
|
||||||
|
|
||||||
String datasourcePath = parser.get("hostedBy");
|
|
||||||
log.info("datasourcePath: {}", datasourcePath);
|
|
||||||
|
|
||||||
String country = parser.get("country");
|
|
||||||
log.info("country: {}", country);
|
|
||||||
|
|
||||||
String[] verifyParam = parser.get("verifyParam").split(";");
|
|
||||||
log.info("verifyParam: {}", verifyParam);
|
|
||||||
|
|
||||||
String collectedfrom = parser.get("collectedfrom");
|
|
||||||
log.info("collectedfrom: {}", collectedfrom);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
|
||||||
|
|
||||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
|
|
||||||
cleanCountry(
|
|
||||||
spark, country, verifyParam, inputPath, entityClazz, workingDir, collectedfrom, datasourcePath);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
|
|
||||||
String inputPath, Class<T> entityClazz, String workingDir, String collectedfrom, String datasourcePath) {
|
|
||||||
|
|
||||||
List<String> hostedBy = spark
|
|
||||||
.read()
|
|
||||||
.textFile(datasourcePath)
|
|
||||||
.collectAsList();
|
|
||||||
|
|
||||||
Dataset<T> res = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz));
|
|
||||||
|
|
||||||
res.map((MapFunction<T, T>) r -> {
|
|
||||||
if (r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
|
|
||||||
!r.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<StructuredProperty> ids = getPidsAndAltIds(r).collect(Collectors.toList());
|
|
||||||
if (ids
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
p -> p
|
|
||||||
.getQualifier()
|
|
||||||
.getClassid()
|
|
||||||
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
|
|
||||||
r
|
|
||||||
.setCountry(
|
|
||||||
r
|
|
||||||
.getCountry()
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
c -> toTakeCountry(c, country))
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return r;
|
|
||||||
}, Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir);
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(workingDir)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(inputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
|
|
||||||
final Stream<StructuredProperty> resultPids = Optional
|
|
||||||
.ofNullable(r.getPid())
|
|
||||||
.map(Collection::stream)
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
final Stream<StructuredProperty> instancePids = Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instance -> instance
|
|
||||||
.stream()
|
|
||||||
.flatMap(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getPid())
|
|
||||||
.map(Collection::stream)
|
|
||||||
.orElse(Stream.empty())))
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
final Stream<StructuredProperty> instanceAltIds = Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instance -> instance
|
|
||||||
.stream()
|
|
||||||
.flatMap(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getAlternateIdentifier())
|
|
||||||
.map(Collection::stream)
|
|
||||||
.orElse(Stream.empty())))
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
return Stream
|
|
||||||
.concat(
|
|
||||||
Stream.concat(resultPids, instancePids),
|
|
||||||
instanceAltIds);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean pidInParam(String value, String[] verifyParam) {
|
|
||||||
for (String s : verifyParam)
|
|
||||||
if (value.startsWith(s))
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean toTakeCountry(Country c, String country) {
|
|
||||||
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
|
|
||||||
// inserted via propagation
|
|
||||||
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
|
|
||||||
return true;
|
|
||||||
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
|
|
||||||
return true;
|
|
||||||
return !(c
|
|
||||||
.getClassid()
|
|
||||||
.equalsIgnoreCase(country) &&
|
|
||||||
c.getDataInfo().getInferenceprovenance().equals("propagation"));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -83,12 +83,57 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="fork_clean_graph"/>
|
<start to="prepare_info"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
|
<fork name="prepare_info">
|
||||||
|
<path start="select_datasourceId_from_country"/>
|
||||||
|
<path start="get_ds_master_duplicate"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<action name="select_datasourceId_from_country">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Select datasource ID from country</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.clean.GetDatasourceFromCountry</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=10000
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${graphInputPath}</arg>
|
||||||
|
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="wait_prepare"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="get_ds_master_duplicate">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
|
||||||
|
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
||||||
|
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||||
|
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||||
|
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="wait_prepare"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name="wait_prepare" to="fork_clean_graph"/>
|
||||||
|
|
||||||
<fork name="fork_clean_graph">
|
<fork name="fork_clean_graph">
|
||||||
<path start="clean_publication"/>
|
<path start="clean_publication"/>
|
||||||
<path start="clean_dataset"/>
|
<path start="clean_dataset"/>
|
||||||
|
@ -115,12 +160,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -141,12 +194,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -167,12 +228,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -193,12 +262,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -219,12 +296,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=1000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -245,12 +330,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=1000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -271,12 +364,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -297,486 +398,26 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=20000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait_clean" to="clean_context"/>
|
<join name="wait_clean" to="End"/>
|
||||||
|
|
||||||
<decision name="clean_context">
|
|
||||||
<switch>
|
|
||||||
<case to="fork_clean_context">${wf:conf('shouldClean') eq true}</case>
|
|
||||||
<default to="End"/>
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<fork name="fork_clean_context">
|
|
||||||
<path start="clean_publication_context"/>
|
|
||||||
<path start="clean_dataset_context"/>
|
|
||||||
<path start="clean_otherresearchproduct_context"/>
|
|
||||||
<path start="clean_software_context"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="clean_publication_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean publications context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_dataset_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean datasets Context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_otherresearchproduct_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean otherresearchproducts context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_software_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean softwares context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait_clean_context" to="select_datasourceId_from_country"/>
|
|
||||||
|
|
||||||
<action name="select_datasourceId_from_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Select datasource ID from country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="fork_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<fork name="fork_clean_country">
|
|
||||||
<path start="clean_publication_country"/>
|
|
||||||
<path start="clean_dataset_country"/>
|
|
||||||
<path start="clean_otherresearchproduct_country"/>
|
|
||||||
<path start="clean_software_country"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="clean_publication_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean publication country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_dataset_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean dataset country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_otherresearchproduct_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean otherresearchproduct country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_software_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean software country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait_clean_country" to="should_patch_datasource_ids"/>
|
|
||||||
|
|
||||||
<decision name="should_patch_datasource_ids">
|
|
||||||
<switch>
|
|
||||||
<case to="get_ds_master_duplicate">${wf:conf('shouldClean') eq true}</case>
|
|
||||||
<default to="End"/>
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="get_ds_master_duplicate">
|
|
||||||
<java>
|
|
||||||
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
|
|
||||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
|
||||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
|
||||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="fork_patch_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<fork name="fork_patch_cfhb">
|
|
||||||
<path start="patch_publication_cfhb"/>
|
|
||||||
<path start="patch_dataset_cfhb"/>
|
|
||||||
<path start="patch_otherresearchproduct_cfhb"/>
|
|
||||||
<path start="patch_software_cfhb"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="patch_publication_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch publication cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/publication</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/publication</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="patch_dataset_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch dataset cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/dataset</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/dataset</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="patch_otherresearchproduct_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch otherresearchproduct cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/otherresearchproduct</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="patch_software_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch software cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/software</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/software</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait_clean_cfhb" to="fork_copy_cfhb_patched_results"/>
|
|
||||||
|
|
||||||
<fork name="fork_copy_cfhb_patched_results">
|
|
||||||
<path start="copy_cfhb_patched_publication"/>
|
|
||||||
<path start="copy_cfhb_patched_dataset"/>
|
|
||||||
<path start="copy_cfhb_patched_otherresearchproduct"/>
|
|
||||||
<path start="copy_cfhb_patched_software"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_publication">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/publication"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/publication</arg>
|
|
||||||
<arg>${graphOutputPath}/publication</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_dataset">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/dataset"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/dataset</arg>
|
|
||||||
<arg>${graphOutputPath}/dataset</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_otherresearchproduct">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/otherresearchproduct"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
|
|
||||||
<arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_software">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/software"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/software</arg>
|
|
||||||
<arg>${graphOutputPath}/software</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="copy_wait" to="End"/>
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
|
|
|
@ -28,5 +28,53 @@
|
||||||
"paramLongName": "graphTableClassName",
|
"paramLongName": "graphTableClassName",
|
||||||
"paramDescription": "class name moelling the graph table",
|
"paramDescription": "class name moelling the graph table",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "ci",
|
||||||
|
"paramLongName": "contextId",
|
||||||
|
"paramDescription": "the id of the context to be removed",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "vf",
|
||||||
|
"paramLongName": "verifyParam",
|
||||||
|
"paramDescription": "the parameter to be verified to remove the context",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "c",
|
||||||
|
"paramLongName": "country",
|
||||||
|
"paramDescription": "the id of the context to be removed",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "vfc",
|
||||||
|
"paramLongName": "verifyCountryParam",
|
||||||
|
"paramDescription": "the parameter to be verified to remove the country",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "cf",
|
||||||
|
"paramLongName": "collectedfrom",
|
||||||
|
"paramDescription": "the collectedfrom value for which we should apply the cleaning",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "hb",
|
||||||
|
"paramLongName": "hostedBy",
|
||||||
|
"paramDescription": "the set of datasources having the specified country in the graph searched for in the hostedby of the results",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "md",
|
||||||
|
"paramLongName": "masterDuplicatePath",
|
||||||
|
"paramDescription": "path to the file on HDFS holding the datasource id tuples [master, duplicate]",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "dc",
|
||||||
|
"paramLongName": "deepClean",
|
||||||
|
"paramDescription": "flag to activate further cleaning steps",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,289 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.AfterAll;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class CleanContextTest {
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path workingDir;
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException {
|
|
||||||
workingDir = Files.createTempDirectory(CleanContextTest.class.getSimpleName());
|
|
||||||
log.info("using work dir {}", workingDir);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(CleanContextTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
|
||||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(CleanContextTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testResultClean() throws Exception {
|
|
||||||
final String sourcePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json")
|
|
||||||
.getPath();
|
|
||||||
final String prefix = "gcube ";
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(sourcePath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
|
|
||||||
Encoders.bean(Publication.class))
|
|
||||||
.write()
|
|
||||||
.json(workingDir.toString() + "/publication");
|
|
||||||
|
|
||||||
CleanContextSparkJob.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", workingDir.toString() + "/publication",
|
|
||||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
|
||||||
"--workingDir", workingDir.toString() + "/working",
|
|
||||||
"--contextId", "sobigdata",
|
|
||||||
"--verifyParam", "gCube "
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
JavaRDD<Publication> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/publication")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(7, tmp.count());
|
|
||||||
|
|
||||||
// original result with sobigdata context and gcube as starting string in the main title for the publication
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
0,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with sobigdata context without gcube as starting string in the main title for the publication
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"sobigdata::projects::2",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
|
|
||||||
// original result with sobigdata context with gcube as starting string in the subtitle
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"sobigdata::projects::2",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
List<StructuredProperty> titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(1, titles.size());
|
|
||||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
|
||||||
Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
|
|
||||||
|
|
||||||
// original result with sobigdata context with gcube not as starting string in the main title
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"sobigdata::projects::1",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(1, titles.size());
|
|
||||||
Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
|
||||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
|
|
||||||
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
|
||||||
|
|
||||||
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
|
|
||||||
// title
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"dh-ch",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(1, titles.size());
|
|
||||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
|
||||||
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
|
||||||
|
|
||||||
// original result with multiple main title one of which whith gcube as starting string and with 2 contextes
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"dh-ch",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(2, titles.size());
|
|
||||||
Assertions
|
|
||||||
.assertTrue(
|
|
||||||
titles
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
t -> t.getQualifier().getClassid().equals("main title")
|
|
||||||
&& t.getValue().toLowerCase().startsWith(prefix)));
|
|
||||||
|
|
||||||
// original result without sobigdata in context with gcube as starting string for the main title
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"dh-ch",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(2, titles.size());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertTrue(
|
|
||||||
titles
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
t -> t.getQualifier().getClassid().equals("main title")
|
|
||||||
&& t.getValue().toLowerCase().startsWith(prefix)));
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,190 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.AfterAll;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author miriam.baglioni
|
|
||||||
* @Date 20/07/22
|
|
||||||
*/
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
|
|
||||||
public class CleanCountryTest {
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path workingDir;
|
|
||||||
|
|
||||||
// logger bound to this test class, so that log lines are attributed to CleanCountryTest
private static final Logger log = LoggerFactory.getLogger(CleanCountryTest.class);
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException {
|
|
||||||
workingDir = Files.createTempDirectory(CleanCountryTest.class.getSimpleName());
|
|
||||||
log.info("using work dir {}", workingDir);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(CleanCountryTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
|
||||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(CleanCountryTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testResultClean() throws Exception {
|
|
||||||
final String sourcePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_country.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(sourcePath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
|
|
||||||
Encoders.bean(Publication.class))
|
|
||||||
.write()
|
|
||||||
.json(workingDir.toString() + "/publication");
|
|
||||||
|
|
||||||
CleanCountrySparkJob.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", workingDir.toString() + "/publication",
|
|
||||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
|
||||||
"--workingDir", workingDir.toString() + "/working",
|
|
||||||
"--country", "NL",
|
|
||||||
"--verifyParam", "10.17632",
|
|
||||||
"--collectedfrom", "NARCIS",
|
|
||||||
"--hostedBy", getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
|
||||||
.getPath()
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
JavaRDD<Publication> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/publication")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(8, tmp.count());
|
|
||||||
|
|
||||||
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with NL country and pid not starting with Mendely prefix
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
|
|
||||||
// inserted with propagation
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
|
|
||||||
// propagation
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
0,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6ag"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testDatasetClean() throws Exception {
|
|
||||||
final String sourcePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(sourcePath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, Dataset>) r -> OBJECT_MAPPER.readValue(r, Dataset.class),
|
|
||||||
Encoders.bean(Dataset.class))
|
|
||||||
.write()
|
|
||||||
.json(workingDir.toString() + "/dataset");
|
|
||||||
|
|
||||||
CleanCountrySparkJob.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", workingDir.toString() + "/dataset",
|
|
||||||
"-graphTableClassName", Dataset.class.getCanonicalName(),
|
|
||||||
"-workingDir", workingDir.toString() + "/working",
|
|
||||||
"-country", "NL",
|
|
||||||
"-verifyParam", "10.17632",
|
|
||||||
"-collectedfrom", "NARCIS",
|
|
||||||
"-hostedBy", getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
|
||||||
.getPath()
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
JavaRDD<Dataset> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/dataset")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(1, tmp.count());
|
|
||||||
|
|
||||||
Assertions.assertEquals(0, tmp.first().getCountry().size());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,924 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.io.filefilter.FalseFileFilter;
|
||||||
|
import org.apache.commons.io.filefilter.TrueFileFilter;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
public class CleanGraphSparkJobTest {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJobTest.class);
|
||||||
|
|
||||||
|
public static final ObjectMapper MAPPER = new ObjectMapper()
|
||||||
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private ISLookUpService isLookUpService;
|
||||||
|
|
||||||
|
private VocabularyGroup vocabularies;
|
||||||
|
|
||||||
|
private CleaningRuleMap mapping;
|
||||||
|
|
||||||
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
private static Path testBaseTmpPath;
|
||||||
|
|
||||||
|
private static String graphInputPath;
|
||||||
|
|
||||||
|
private static String graphOutputPath;
|
||||||
|
|
||||||
|
private static String dsMasterDuplicatePath;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void beforeAll() throws IOException, URISyntaxException {
|
||||||
|
testBaseTmpPath = Files.createTempDirectory(CleanGraphSparkJobTest.class.getSimpleName());
|
||||||
|
log.info("using test base path {}", testBaseTmpPath);
|
||||||
|
|
||||||
|
File basePath = Paths
|
||||||
|
.get(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
CleanGraphSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/graph"))
|
||||||
|
.toURI())
|
||||||
|
.toFile();
|
||||||
|
|
||||||
|
List<File> paths = FileUtils
|
||||||
|
.listFilesAndDirs(basePath, FalseFileFilter.FALSE, TrueFileFilter.TRUE)
|
||||||
|
.stream()
|
||||||
|
.filter(f -> !f.getAbsolutePath().endsWith("/graph"))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
for (File path : paths) {
|
||||||
|
String type = StringUtils.substringAfterLast(path.getAbsolutePath(), "/");
|
||||||
|
FileUtils
|
||||||
|
.copyDirectory(
|
||||||
|
path,
|
||||||
|
testBaseTmpPath.resolve("input").resolve("graph").resolve(type).toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
FileUtils
|
||||||
|
.copyFileToDirectory(
|
||||||
|
Paths
|
||||||
|
.get(
|
||||||
|
CleanGraphSparkJobTest.class
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
|
||||||
|
.toURI())
|
||||||
|
.toFile(),
|
||||||
|
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
|
||||||
|
|
||||||
|
graphInputPath = testBaseTmpPath.resolve("input").resolve("graph").toString();
|
||||||
|
graphOutputPath = testBaseTmpPath.resolve("output").resolve("graph").toString();
|
||||||
|
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(CleanGraphSparkJobTest.class.getSimpleName());
|
||||||
|
|
||||||
|
conf.setMaster("local[*]");
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
conf.set("hive.metastore.local", "true");
|
||||||
|
conf.set("spark.ui.enabled", "false");
|
||||||
|
conf.set("spark.sql.warehouse.dir", testBaseTmpPath.toString());
|
||||||
|
conf.set("hive.metastore.warehouse.dir", testBaseTmpPath.resolve("warehouse").toString());
|
||||||
|
|
||||||
|
spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws ISLookUpException, IOException {
|
||||||
|
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||||
|
.thenReturn(synonyms());
|
||||||
|
|
||||||
|
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
|
mapping = CleaningRuleMap.create(vocabularies);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void afterAll() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanRelations() throws Exception {
|
||||||
|
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(graphInputPath.toString() + "/relation")
|
||||||
|
.map(as(Relation.class), Encoders.bean(Relation.class))
|
||||||
|
.collectAsList()
|
||||||
|
.forEach(
|
||||||
|
r -> assertFalse(
|
||||||
|
vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass())));
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/relation",
|
||||||
|
"--outputPath", graphOutputPath + "/relation",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Relation.class.getCanonicalName(),
|
||||||
|
"--deepClean", "false",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(graphOutputPath.toString() + "/relation")
|
||||||
|
.map(as(Relation.class), Encoders.bean(Relation.class))
|
||||||
|
.collectAsList()
|
||||||
|
.forEach(r -> {
|
||||||
|
|
||||||
|
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass()));
|
||||||
|
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r.getSubRelType()));
|
||||||
|
|
||||||
|
assertEquals("iis", r.getProvenance().get(0).getDataInfo().getProvenanceaction().getClassid());
|
||||||
|
assertEquals("Inferred by OpenAIRE", r.getProvenance().get(0).getDataInfo().getProvenanceaction().getClassname());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFilter_invisible_true() throws Exception {
|
||||||
|
|
||||||
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
|
String json = IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json")));
|
||||||
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFilter_true_nothing_to_filter() throws Exception {
|
||||||
|
|
||||||
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
|
String json = IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")));
|
||||||
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFilter_missing_invisible() throws Exception {
|
||||||
|
|
||||||
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
|
String json = IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json")));
|
||||||
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning_publication() throws Exception {
|
||||||
|
|
||||||
|
final String id = "50|CSC_________::2250a70c903c6ac6e4c01438259e9375";
|
||||||
|
|
||||||
|
Publication p_in = read(spark, graphInputPath + "/publication", Publication.class)
|
||||||
|
.filter(String.format("id = '%s'", id))
|
||||||
|
.first();
|
||||||
|
|
||||||
|
assertNull(p_in.getBestaccessright());
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "false",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
Publication p = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter(String.format("id = '%s'", id))
|
||||||
|
.first();
|
||||||
|
|
||||||
|
assertNull(p.getPublisher());
|
||||||
|
|
||||||
|
assertEquals("und", p.getLanguage().getClassid());
|
||||||
|
assertEquals("Undetermined", p.getLanguage().getClassname());
|
||||||
|
|
||||||
|
assertEquals("DE", p.getCountry().get(0).getClassid());
|
||||||
|
assertEquals("Germany", p.getCountry().get(0).getClassname());
|
||||||
|
|
||||||
|
assertEquals("0018", p.getInstance().get(0).getInstancetype().getClassid());
|
||||||
|
assertEquals("Annotation", p.getInstance().get(0).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("0027", p.getInstance().get(1).getInstancetype().getClassid());
|
||||||
|
assertEquals("Model", p.getInstance().get(1).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
|
||||||
|
assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("CLOSED", p.getInstance().get(0).getAccessright().getClassid());
|
||||||
|
assertEquals("Closed Access", p.getInstance().get(0).getAccessright().getClassname());
|
||||||
|
|
||||||
|
Set<String> pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES);
|
||||||
|
assertTrue(
|
||||||
|
p
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getQualifier)
|
||||||
|
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||||
|
|
||||||
|
List<Instance> poi = p.getInstance();
|
||||||
|
assertNotNull(poi);
|
||||||
|
assertEquals(3, poi.size());
|
||||||
|
|
||||||
|
final Instance poii = poi.get(0);
|
||||||
|
assertNotNull(poii);
|
||||||
|
assertNotNull(poii.getPid());
|
||||||
|
|
||||||
|
assertEquals(2, poii.getPid().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||||
|
assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
||||||
|
|
||||||
|
assertNotNull(poii.getAlternateIdentifier());
|
||||||
|
assertEquals(1, poii.getAlternateIdentifier().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
poii
|
||||||
|
.getAlternateIdentifier()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||||
|
|
||||||
|
assertEquals(3, p.getTitle().size());
|
||||||
|
|
||||||
|
List<String> titles = p
|
||||||
|
.getTitle()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getValue)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertTrue(titles.contains("omic"));
|
||||||
|
assertTrue(
|
||||||
|
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
|
||||||
|
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
|
||||||
|
|
||||||
|
assertEquals("CLOSED", p.getBestaccessright().getClassid());
|
||||||
|
assertNull(p.getPublisher());
|
||||||
|
|
||||||
|
assertEquals("1970-10-07", p.getDateofacceptance());
|
||||||
|
|
||||||
|
assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
|
||||||
|
assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
final List<Instance> pci = p.getInstance();
|
||||||
|
assertNotNull(pci);
|
||||||
|
assertEquals(3, pci.size());
|
||||||
|
|
||||||
|
final Instance pcii = pci.get(0);
|
||||||
|
assertNotNull(pcii);
|
||||||
|
assertNotNull(pcii.getPid());
|
||||||
|
|
||||||
|
assertEquals(2, pcii.getPid().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||||
|
assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
||||||
|
|
||||||
|
assertNotNull(pcii.getAlternateIdentifier());
|
||||||
|
assertEquals(1, pcii.getAlternateIdentifier().size());
|
||||||
|
assertTrue(
|
||||||
|
pcii
|
||||||
|
.getAlternateIdentifier()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||||
|
|
||||||
|
assertNotNull(p.getSubject());
|
||||||
|
|
||||||
|
List<Subject> fos_subjects = p
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
assertNotNull(fos_subjects);
|
||||||
|
assertEquals(2, fos_subjects.size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
fos_subjects
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> "0101 mathematics".equals(s.getValue()) &
|
||||||
|
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||||
|
"sysimport:crosswalk:datasetarchive"
|
||||||
|
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
fos_subjects
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||||
|
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||||
|
|
||||||
|
verify_keyword(p, "In Situ Hybridization");
|
||||||
|
verify_keyword(p, "Avicennia");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanDoiBoost() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
|
||||||
|
verifyFiltering(1, "50|doi_________::b0baa0eb88a5788f0b8815560d2a32f2");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanDoiBoost2() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
|
||||||
|
verifyFiltering(1, "50|doi_________::4972b0ca81b96b225aed8038bb965656");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyFiltering(int expectedCount, String id)
|
||||||
|
throws ISLookUpException, ClassNotFoundException, IOException, ParseException {
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "false",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
Dataset<Publication> p = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter(String.format("id = '%s'", id));
|
||||||
|
|
||||||
|
assertEquals(expectedCount, p.count());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanContext() throws Exception {
|
||||||
|
final String prefix = "gcube ";
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "true",
|
||||||
|
"--contextId", "sobigdata",
|
||||||
|
"--verifyParam", "gCube ",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
"--country", "NL",
|
||||||
|
"--verifyCountryParam", "10.17632",
|
||||||
|
"--collectedfrom", "NARCIS",
|
||||||
|
"--hostedBy", Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
|
||||||
|
.getPath()
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
Dataset<Publication> pubs = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter((FilterFunction<Publication>) p1 -> StringUtils.endsWith(p1.getId(), "_ctx"));
|
||||||
|
|
||||||
|
assertEquals(7, pubs.count());
|
||||||
|
|
||||||
|
// original result with sobigdata context and gcube as starting string in the main title for the publication
|
||||||
|
assertEquals(
|
||||||
|
0,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::0224aae28af558f21768dbc6439a_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with sobigdata context without gcube as starting string in the main title for the publication
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"sobigdata::projects::2",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
|
||||||
|
// original result with sobigdata context with gcube as starting string in the subtitle
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"sobigdata::projects::2",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
|
||||||
|
List<StructuredProperty> titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(1, titles.size());
|
||||||
|
assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||||
|
assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
|
||||||
|
|
||||||
|
// original result with sobigdata context with gcube not as starting string in the main title
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"sobigdata::projects::1",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(1, titles.size());
|
||||||
|
assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||||
|
assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
|
||||||
|
assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
||||||
|
|
||||||
|
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
|
||||||
|
// title
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"dh-ch",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(1, titles.size());
|
||||||
|
assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||||
|
assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
||||||
|
|
||||||
|
// original result with multiple main title one of which whith gcube as starting string and with 2 contextes
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"dh-ch",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(2, titles.size());
|
||||||
|
assertTrue(
|
||||||
|
titles
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
t -> t.getQualifier().getClassid().equals("main title")
|
||||||
|
&& t.getValue().toLowerCase().startsWith(prefix)));
|
||||||
|
|
||||||
|
// original result without sobigdata in context with gcube as starting string for the main title
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"dh-ch",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(2, titles.size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
titles
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
t -> t.getQualifier().getClassid().equals("main title")
|
||||||
|
&& t.getValue().toLowerCase().startsWith(prefix)));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanCfHbSparkJob() throws Exception {
|
||||||
|
|
||||||
|
final Dataset<Publication> pubs_in = read(spark, graphInputPath + "/publication", Publication.class);
|
||||||
|
final Publication p1_in = pubs_in
|
||||||
|
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", p1_in.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("Bacterial Protein Interaction Database - DUP", p1_in.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc",
|
||||||
|
p1_in.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"Bacterial Protein Interaction Database - DUP", p1_in.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
|
||||||
|
final Publication p2_in = pubs_in
|
||||||
|
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", p2_in.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("FILUR DATA - DUP", p2_in.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35",
|
||||||
|
p2_in.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals("FILUR DATA - DUP", p2_in.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", p2_in.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals("depositar - DUP", p2_in.getInstance().get(0).getHostedby().getValue());
|
||||||
|
|
||||||
|
final Publication p3_in = pubs_in
|
||||||
|
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_in.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("DANS (Data Archiving and Networked Services)", p3_in.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f",
|
||||||
|
p3_in.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_in.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_in.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_in.getInstance().get(0).getHostedby().getValue());
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "true",
|
||||||
|
"--contextId", "sobigdata",
|
||||||
|
"--verifyParam", "gCube ",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
"--country", "NL",
|
||||||
|
"--verifyCountryParam", "10.17632",
|
||||||
|
"--collectedfrom", "NARCIS",
|
||||||
|
"--hostedBy", Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
|
||||||
|
.getPath()
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
assertTrue(Files.exists(Paths.get(graphOutputPath, "publication")));
|
||||||
|
|
||||||
|
final Dataset<Publication> pubs_out = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_cfhb"));
|
||||||
|
|
||||||
|
assertEquals(3, pubs_out.count());
|
||||||
|
|
||||||
|
final Publication p1_out = pubs_out
|
||||||
|
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", p1_out.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("Bacterial Protein Interaction Database", p1_out.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce",
|
||||||
|
p1_out.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"Bacterial Protein Interaction Database", p1_out.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
|
||||||
|
final Publication p2_out = pubs_out
|
||||||
|
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", p2_out.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("FULIR Data", p2_out.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|re3data_____::fc1db64b3964826913b1e9eafe830490",
|
||||||
|
p2_out.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals("FULIR Data", p2_out.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", p2_out.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals("depositar", p2_out.getInstance().get(0).getHostedby().getValue());
|
||||||
|
|
||||||
|
final Publication p3_out = pubs_out
|
||||||
|
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_out.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("DANS (Data Archiving and Networked Services)", p3_out.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f",
|
||||||
|
p3_out.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_out.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_out.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_out.getInstance().get(0).getHostedby().getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanCountry() throws Exception {
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "true",
|
||||||
|
"--contextId", "sobigdata",
|
||||||
|
"--verifyParam", "gCube ",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
"--country", "NL",
|
||||||
|
"--verifyCountryParam", "10.17632",
|
||||||
|
"--collectedfrom", "NARCIS",
|
||||||
|
"--hostedBy", Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
|
||||||
|
.getPath()
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
final Dataset<Publication> pubs_out = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_country"));
|
||||||
|
|
||||||
|
assertEquals(8, pubs_out.count());
|
||||||
|
|
||||||
|
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::0224aae28af558f21768dbc6_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with NL country and pid not starting with Mendely prefix
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
|
||||||
|
// inserted with propagation
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817e_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
|
||||||
|
// propagation
|
||||||
|
assertEquals(
|
||||||
|
0,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817d_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> vocs() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> synonyms() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
|
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
|
||||||
|
return spark
|
||||||
|
.read()
|
||||||
|
.textFile(path)
|
||||||
|
.map(as(clazz), Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
||||||
|
return s -> MAPPER.readValue(s, clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String classPathResourceAsString(String path) throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
CleanGraphSparkJobTest.class.getResourceAsStream(path)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
|
||||||
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
return parser;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void verify_keyword(Publication p_cleaned, String subject) {
|
||||||
|
Optional<Subject> s1 = p_cleaned
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> s.getValue().equals(subject))
|
||||||
|
.findFirst();
|
||||||
|
|
||||||
|
assertTrue(s1.isPresent());
|
||||||
|
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassid());
|
||||||
|
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassname());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -13,7 +13,6 @@ import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
@ -59,7 +58,7 @@ public class GraphCleaningFunctionsTest {
|
||||||
void testCleanRelations() throws Exception {
|
void testCleanRelations() throws Exception {
|
||||||
|
|
||||||
List<String> lines = IOUtils
|
List<String> lines = IOUtils
|
||||||
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json"));
|
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/graph/relation/relation.json"));
|
||||||
for (String json : lines) {
|
for (String json : lines) {
|
||||||
Relation r_in = MAPPER.readValue(json, Relation.class);
|
Relation r_in = MAPPER.readValue(json, Relation.class);
|
||||||
assertNotNull(r_in);
|
assertNotNull(r_in);
|
||||||
|
|
|
@ -1,213 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.*;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
|
|
||||||
public class CleanCfHbSparkJobTest {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJobTest.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path testBaseTmpPath;
|
|
||||||
|
|
||||||
private static String resolvedPath;
|
|
||||||
|
|
||||||
private static String graphInputPath;
|
|
||||||
|
|
||||||
private static String graphOutputPath;
|
|
||||||
|
|
||||||
private static String dsMasterDuplicatePath;
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException, URISyntaxException {
|
|
||||||
|
|
||||||
testBaseTmpPath = Files.createTempDirectory(CleanCfHbSparkJobTest.class.getSimpleName());
|
|
||||||
log.info("using test base path {}", testBaseTmpPath);
|
|
||||||
|
|
||||||
final File entitiesSources = Paths
|
|
||||||
.get(CleanCfHbSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities").toURI())
|
|
||||||
.toFile();
|
|
||||||
|
|
||||||
FileUtils
|
|
||||||
.copyDirectory(
|
|
||||||
entitiesSources,
|
|
||||||
testBaseTmpPath.resolve("input").resolve("entities").toFile());
|
|
||||||
|
|
||||||
FileUtils
|
|
||||||
.copyFileToDirectory(
|
|
||||||
Paths
|
|
||||||
.get(
|
|
||||||
CleanCfHbSparkJobTest.class
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
|
|
||||||
.toURI())
|
|
||||||
.toFile(),
|
|
||||||
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
|
|
||||||
|
|
||||||
graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
|
|
||||||
resolvedPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbResolved").toString();
|
|
||||||
graphOutputPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbPatched").toString();
|
|
||||||
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(CleanCfHbSparkJobTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(CleanCfHbSparkJobTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testCleanCfHbSparkJob() throws Exception {
|
|
||||||
final String outputPath = graphOutputPath + "/dataset";
|
|
||||||
final String inputPath = graphInputPath + "/dataset";
|
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Dataset> records = read(spark, inputPath, Dataset.class);
|
|
||||||
Dataset d = records
|
|
||||||
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("Bacterial Protein Interaction Database - DUP", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"Bacterial Protein Interaction Database - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("FILUR DATA - DUP", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals("FILUR DATA - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("depositar - DUP", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
CleanCfHbSparkJob
|
|
||||||
.main(
|
|
||||||
new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", inputPath,
|
|
||||||
"--outputPath", outputPath,
|
|
||||||
"--resolvedPath", resolvedPath + "/dataset",
|
|
||||||
"--graphTableClassName", Dataset.class.getCanonicalName(),
|
|
||||||
"--masterDuplicatePath", dsMasterDuplicatePath
|
|
||||||
});
|
|
||||||
|
|
||||||
assertTrue(Files.exists(Paths.get(graphOutputPath, "dataset")));
|
|
||||||
|
|
||||||
records = read(spark, outputPath, Dataset.class);
|
|
||||||
|
|
||||||
assertEquals(3, records.count());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("Bacterial Protein Interaction Database", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals("Bacterial Protein Interaction Database", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("FULIR Data", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals("FULIR Data", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("depositar", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
|
|
||||||
return spark
|
|
||||||
.read()
|
|
||||||
.textFile(path)
|
|
||||||
.map(as(clazz), Encoders.bean(clazz));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
|
||||||
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
|
||||||
}
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,11 @@
|
||||||
|
<!-- Logback configuration: a single console appender, attached to the root logger at level "info". -->
<configuration>
|
||||||
|
<!-- Console appender referenced below by its name, STDOUT. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
||||||
|
<encoder>
|
||||||
|
<!-- Layout: time (HH:mm:ss.SSS), level (%-5level), logger name (%logger{36}), " - ", message, newline. -->
<pattern>%d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n</pattern>
|
||||||
|
</encoder>
|
||||||
|
</appender>
|
||||||
|
|
||||||
|
<!-- Root logger: level "info", output sent to the STDOUT appender. -->
<root level="info">
|
||||||
|
<appender-ref ref="STDOUT" />
|
||||||
|
</root>
|
||||||
|
</configuration>
|
Loading…
Reference in New Issue