WIP merged from graph_cleaning_refactoring, applying model simplification

This commit is contained in:
Claudio Atzori 2023-04-26 16:02:06 +02:00
commit 8c9a77d7eb
33 changed files with 1956 additions and 2720 deletions

View File

@ -86,121 +86,10 @@ public class ModelConstants {
public static final Qualifier PROVENANCE_ACTION_SET_QUALIFIER = qualifier(
SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS);
public static final String DATASET_RESULTTYPE_CLASSID = "dataset";
public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication";
public static final String SOFTWARE_RESULTTYPE_CLASSID = "software";
public static final String ORP_RESULTTYPE_CLASSID = "other";
public static final String RESULT_RESULT = "resultResult"; // relType
/**
* @deprecated Use {@link ModelConstants#RELATIONSHIP} instead.
*/
@Deprecated
public static final String PUBLICATION_DATASET = "publicationDataset"; // subreltype
public static final String SUPPLEMENT = "supplement"; // subreltype
public static final String IS_SUPPLEMENT_TO = "IsSupplementTo";
public static final String IS_SUPPLEMENTED_BY = "IsSupplementedBy";
public static final String PART = "part"; // subreltype
public static final String IS_PART_OF = "IsPartOf";
public static final String HAS_PART = "HasPart";
public static final String RELATIONSHIP = "relationship"; // subreltype
public static final String IS_RELATED_TO = "IsRelatedTo";
public static final String IS_IDENTICAL_TO = "IsIdenticalTo";
public static final String REFERENCES = "References";
public static final String IS_REFERENCED_BY = "IsReferencedBy";
public static final String CONTINUES = "Continues";
public static final String IS_CONTINUED_BY = "IsContinuedBy";
public static final String DOCUMENTS = "Documents";
public static final String IS_DOCUMENTED_BY = "IsDocumentedBy";
public static final String IS_SOURCE_OF = "IsSourceOf";
public static final String IS_DERIVED_FROM = "IsDerivedFrom";
public static final String COMPILES = "Compiles";
public static final String IS_COMPILED_BY = "IsCompiledBy";
public static final String DESCRIBES = "Describes";
public static final String IS_DESCRIBED_BY = "IsDescribedBy";
public static final String IS_METADATA_FOR = "IsMetadataFor";
public static final String IS_METADATA_OF = "IsMetadataOf";
public static final String HAS_ASSOCIATION_WITH = "HasAssociationWith";
public static final String IS_REQUIRED_BY = "IsRequiredBy";
public static final String REQUIRES = "Requires";
public static final String CITATION = "citation"; // subreltype
public static final String CITES = "Cites";
public static final String IS_CITED_BY = "IsCitedBy";
public static final String REVIEW = "review"; // subreltype
public static final String REVIEWS = "Reviews";
public static final String IS_REVIEWED_BY = "IsReviewedBy";
public static final String VERSION = "version"; // subreltype
public static final String IS_VERSION_OF = "IsVersionOf";
public static final String HAS_VERSION = "HasVersion";
public static final String IS_PREVIOUS_VERSION_OF = "IsPreviousVersionOf";
public static final String IS_NEW_VERSION_OF = "IsNewVersionOf";
public static final String IS_VARIANT_FORM_OF = "IsVariantFormOf";
public static final String IS_ORIGINAL_FORM_OF = "IsOriginalFormOf";
public static final String IS_OBSOLETED_BY = "IsObsoletedBy";
public static final String OBSOLETES = "Obsoletes";
public static final String RESULT_PROJECT = "resultProject"; // relType
public static final String OUTCOME = "outcome"; // subreltype
public static final String IS_PRODUCED_BY = "isProducedBy";
public static final String PRODUCES = "produces";
public static final String DATASOURCE_ORGANIZATION = "datasourceOrganization"; // relType
public static final String PROVISION = "provision"; // subreltype
public static final String IS_PROVIDED_BY = "isProvidedBy";
public static final String PROVIDES = "provides";
public static final String PROJECT_ORGANIZATION = "projectOrganization"; // relType
public static final String PARTICIPATION = "participation"; // subreltype
public static final String HAS_PARTICIPANT = "hasParticipant";
public static final String IS_PARTICIPANT = "isParticipant";
public static final String RESULT_ORGANIZATION = "resultOrganization"; // relType
public static final String AFFILIATION = "affiliation"; // subreltype
public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf";
public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution";
public static final String ORG_ORG_RELTYPE = "organizationOrganization"; // relType
public static final String IS_PARENT_OF = "IsParentOf";
public static final String IS_CHILD_OF = "IsChildOf";
public static final String DEDUP = "dedup"; // subreltype
public static final String MERGES = "merges";
public static final String IS_MERGED_IN = "isMergedIn";
public static final String SIMILARITY = "similarity"; // subreltype
public static final String IS_SIMILAR_TO = "isSimilarTo";
public static final String IS_AMONG_TOP_N_SIMILAR_DOCS = "IsAmongTopNSimilarDocuments";
public static final String HAS_AMONG_TOP_N_SIMILAR_DOCS = "HasAmongTopNSimilarDocuments";
public static final String IS_DIFFERENT_FROM = "isDifferentFrom";
public static final String UNKNOWN = "UNKNOWN";
public static final String NOT_AVAILABLE = "not available";
public static final Qualifier PUBLICATION_DEFAULT_RESULTTYPE = qualifier(
PUBLICATION_RESULTTYPE_CLASSID, PUBLICATION_RESULTTYPE_CLASSID,
DNET_RESULT_TYPOLOGIES);
public static final Qualifier DATASET_DEFAULT_RESULTTYPE = qualifier(
DATASET_RESULTTYPE_CLASSID, DATASET_RESULTTYPE_CLASSID,
DNET_RESULT_TYPOLOGIES);
public static final Qualifier SOFTWARE_DEFAULT_RESULTTYPE = qualifier(
SOFTWARE_RESULTTYPE_CLASSID, SOFTWARE_RESULTTYPE_CLASSID,
DNET_RESULT_TYPOLOGIES);
public static final Qualifier ORP_DEFAULT_RESULTTYPE = qualifier(
ORP_RESULTTYPE_CLASSID, ORP_RESULTTYPE_CLASSID,
DNET_RESULT_TYPOLOGIES);
public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = qualifier(
SYSIMPORT_CROSSWALK_REPOSITORY, SYSIMPORT_CROSSWALK_REPOSITORY,
DNET_PROVENANCE_ACTIONS);

View File

@ -96,48 +96,6 @@ public class ModelSupport {
idPrefixEntity.put("50", "result");
}
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
static {
set(relationInverseMap, PROJECT_ORGANIZATION, PARTICIPATION, IS_PARTICIPANT, HAS_PARTICIPANT);
set(relationInverseMap, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF, HAS_AUTHOR_INSTITUTION);
set(relationInverseMap, ORG_ORG_RELTYPE, DEDUP, IS_MERGED_IN, MERGES);
set(relationInverseMap, ORG_ORG_RELTYPE, DEDUP, IS_SIMILAR_TO, IS_SIMILAR_TO);
set(relationInverseMap, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, PRODUCES);
set(relationInverseMap, DATASOURCE_ORGANIZATION, PROVISION, IS_PROVIDED_BY, PROVIDES);
set(relationInverseMap, RESULT_RESULT, SIMILARITY, IS_AMONG_TOP_N_SIMILAR_DOCS, HAS_AMONG_TOP_N_SIMILAR_DOCS);
set(relationInverseMap, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, IS_SUPPLEMENTED_BY);
set(relationInverseMap, RESULT_RESULT, PART, IS_PART_OF, HAS_PART);
set(relationInverseMap, RESULT_RESULT, DEDUP, IS_MERGED_IN, MERGES);
set(relationInverseMap, RESULT_RESULT, DEDUP, IS_SIMILAR_TO, IS_SIMILAR_TO);
set(relationInverseMap, RESULT_RESULT, CITATION, IS_CITED_BY, CITES);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_IDENTICAL_TO, IS_IDENTICAL_TO);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_REFERENCED_BY, REFERENCES);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_CONTINUED_BY, CONTINUES);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DOCUMENTED_BY, DOCUMENTS);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DERIVED_FROM, IS_SOURCE_OF);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, IS_RELATED_TO);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_COMPILED_BY, COMPILES);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DESCRIBED_BY, DESCRIBES);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_METADATA_FOR, IS_METADATA_OF);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, HAS_ASSOCIATION_WITH, HAS_ASSOCIATION_WITH);
set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_REQUIRED_BY, REQUIRES);
set(relationInverseMap, RESULT_RESULT, VERSION, IS_PREVIOUS_VERSION_OF, IS_NEW_VERSION_OF);
set(relationInverseMap, RESULT_RESULT, VERSION, IS_VARIANT_FORM_OF, IS_ORIGINAL_FORM_OF);
set(relationInverseMap, RESULT_RESULT, VERSION, IS_OBSOLETED_BY, OBSOLETES);
set(relationInverseMap, RESULT_RESULT, VERSION, IS_VERSION_OF, HAS_VERSION);
set(relationInverseMap, RESULT_RESULT, REVIEW, IS_REVIEWED_BY, REVIEWS);
}
private static void set(Map<String, RelationInverse> relationInverseMap, String relType, String subRelType,
String relClass, String inverseRelClass) {
relationInverseMap
@ -158,35 +116,6 @@ public class ModelSupport {
}
}
/**
* Helper method: lookup relation inverse, given the direct relation encoding (case insensitive)
* @param encoding
* @return the relation inverse descriptor, throws @IllegalArgumentException when not found.
*/
public static RelationInverse findInverse(String encoding) {
return ModelSupport.relationInverseMap
.entrySet()
.stream()
.filter(r -> encoding.equalsIgnoreCase(r.getKey()))
.findFirst()
.map(r -> r.getValue())
.orElseThrow(() -> new IllegalArgumentException("invalid relationship: " + encoding));
}
/**
* Helper method: fina a relation filtering by a relation name
* @param relationName
* @return
*/
public static RelationInverse findRelation(final String relationName) {
return relationInverseMap
.values()
.stream()
.filter(r -> relationName.equalsIgnoreCase(r.getRelClass()))
.findFirst()
.orElse(null);
}
/**
* Helper method: combines the relation attributes
* @param relType
@ -364,17 +293,17 @@ public class ModelSupport {
.join(
source,
target,
relType,
subRelType,
relClass))
relType.toString(),
subRelType.toString(),
relClass.toString()))
.orElse(
String
.join(
source,
target,
relType,
subRelType)))
.orElse(String.join(source, target, relType)))
relType.toString(),
subRelType.toString())))
.orElse(String.join(source, target, relType.toString())))
.orElse(String.join(source, target)))
.orElse(source))
.orElse(null);

View File

@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
@ -38,6 +40,127 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
if (ModelSupport.isSubClass(value, Result.class)) {
final Result res = (Result) value;
if (shouldCleanContext(res, verifyParam)) {
res
.setContext(
res
.getContext()
.stream()
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
.collect(Collectors.toList()));
}
return (T) res;
} else {
return value;
}
}
private static boolean shouldCleanContext(Result res, String verifyParam) {
boolean titleMatch = res
.getTitle()
.stream()
.filter(
t -> t
.getQualifier()
.getClassid()
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()));
return titleMatch && Objects.nonNull(res.getContext());
}
public static <T extends Oaf> T cleanCountry(T value, String[] verifyParam, Set<String> hostedBy,
String collectedfrom, String country) {
if (ModelSupport.isSubClass(value, Result.class)) {
final Result res = (Result) value;
if (res.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
!res.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
return (T) res;
}
List<StructuredProperty> ids = getPidsAndAltIds(res).collect(Collectors.toList());
if (ids
.stream()
.anyMatch(
p -> p
.getQualifier()
.getClassid()
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
res
.setCountry(
res
.getCountry()
.stream()
.filter(
c -> toTakeCountry(c, country))
.collect(Collectors.toList()));
}
return (T) res;
} else {
return value;
}
}
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
final Stream<StructuredProperty> resultPids = Optional
.ofNullable(r.getPid())
.map(Collection::stream)
.orElse(Stream.empty());
final Stream<StructuredProperty> instancePids = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getPid())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
final Stream<StructuredProperty> instanceAltIds = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getAlternateIdentifier())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
return Stream
.concat(
Stream.concat(resultPids, instancePids),
instanceAltIds);
}
private static boolean pidInParam(String value, String[] verifyParam) {
for (String s : verifyParam)
if (value.startsWith(s))
return true;
return false;
}
private static boolean toTakeCountry(Country c, String country) {
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
// inserted via propagation
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
return true;
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
return true;
return !(c
.getClassid()
.equalsIgnoreCase(country) &&
c.getDataInfo().getInferenceprovenance().equals("propagation"));
}
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here

View File

@ -18,443 +18,443 @@ import eu.dnetlib.dhp.schema.oaf.common.AccessRightComparator;
public class OafMapperUtils {
private OafMapperUtils() {
}
private OafMapperUtils() {
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> List<T> listValues(Array values) throws SQLException {
if (Objects.isNull(values)) {
return null;
}
return Arrays
.stream((T[]) values.getArray())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
public static <T> List<T> listValues(Array values) throws SQLException {
if (Objects.isNull(values)) {
return null;
}
return Arrays
.stream((T[]) values.getArray())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
public static Qualifier unknown(final String schemeid) {
return qualifier(UNKNOWN, "Unknown", schemeid);
}
public static Qualifier unknown(final String schemeid) {
return qualifier(UNKNOWN, "Unknown", schemeid);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid) {
return accessRight(classid, classname, schemeid, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid) {
return accessRight(classid, classname, schemeid, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
return q;
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final DataInfo dataInfo) {
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid), dataInfo);
}
return subject(value, qualifier(classid, classname, schemeid), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid) {
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid) {
return structuredProperty(value, qualifier(classid, classname, schemeid));
}
return structuredProperty(value, qualifier(classid, classname, schemeid));
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
return sp;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
return sp;
}
public static Publisher publisher(final String name) {
final Publisher p = new Publisher();
p.setName(name);
return p;
}
public static Publisher publisher(final String name) {
final Publisher p = new Publisher();
p.setName(name);
return p;
}
public static License license(final String url) {
final License l = new License();
l.setUrl(url);
return l;
}
public static License license(final String url) {
final License l = new License();
l.setUrl(url);
return l;
}
public static AuthorPid authorPid(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final AuthorPid ap = new AuthorPid();
ap.setValue(value);
ap.setQualifier(qualifier);
ap.setDataInfo(dataInfo);
return ap;
}
public static AuthorPid authorPid(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final AuthorPid ap = new AuthorPid();
ap.setValue(value);
ap.setQualifier(qualifier);
ap.setDataInfo(dataInfo);
return ap;
}
public static AuthorPid authorPid(
final String value,
final String classid,
final String schemeid,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final AuthorPid ap = new AuthorPid();
ap.setValue(value);
ap.setQualifier(qualifier(classid, classid, schemeid));
ap.setDataInfo(dataInfo);
return ap;
}
public static AuthorPid authorPid(
final String value,
final String classid,
final String schemeid,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final AuthorPid ap = new AuthorPid();
ap.setValue(value);
ap.setQualifier(qualifier(classid, classid, schemeid));
ap.setDataInfo(dataInfo);
return ap;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking) {
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking) {
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null) : null;
}
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null) : null;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate) {
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate) {
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
return j;
} else {
return null;
}
}
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
return j;
} else {
return null;
}
}
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
return StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking);
}
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
return StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking);
}
public static DataInfo dataInfo(
final float trust,
final String inferenceprovenance,
final boolean inferred,
final Qualifier provenanceaction) {
final DataInfo d = new DataInfo();
d.setTrust(trust);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setProvenanceaction(provenanceaction);
return d;
}
public static DataInfo dataInfo(
final float trust,
final String inferenceprovenance,
final boolean inferred,
final Qualifier provenanceaction) {
final DataInfo d = new DataInfo();
d.setTrust(trust);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setProvenanceaction(provenanceaction);
return d;
}
public static EntityDataInfo dataInfo(
final boolean invisible,
final boolean deletedbyinference,
final float trust,
final String inferenceprovenance,
final boolean inferred,
final Qualifier provenanceaction) {
final EntityDataInfo d = new EntityDataInfo();
d.setTrust(trust);
d.setInvisible(invisible);
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setProvenanceaction(provenanceaction);
return d;
}
public static EntityDataInfo dataInfo(
final boolean invisible,
final boolean deletedbyinference,
final float trust,
final String inferenceprovenance,
final boolean inferred,
final Qualifier provenanceaction) {
final EntityDataInfo d = new EntityDataInfo();
d.setTrust(trust);
d.setInvisible(invisible);
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setProvenanceaction(provenanceaction);
return d;
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(Instance::getAccessright)
.min(new AccessRightComparator<>());
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(Instance::getAccessright)
.min(new AccessRightComparator<>());
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
return rights;
}
return null;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(unit(key, value, dataInfo)));
return m;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(unit(key, value, dataInfo)));
return m;
}
public static MeasureUnit unit(String key, String value, DataInfo dataInfo) {
MeasureUnit unit = new MeasureUnit();
unit.setKey(key);
unit.setValue(value);
unit.setDataInfo(dataInfo);
return unit;
}
public static MeasureUnit unit(String key, String value, DataInfo dataInfo) {
MeasureUnit unit = new MeasureUnit();
unit.setKey(key);
unit.setValue(value);
unit.setDataInfo(dataInfo);
return unit;
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final Entity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
public static Relation getRelation(final String source,
final String target,
final Relation.RELTYPE relType,
final Relation.SUBRELTYPE subRelType,
final Relation.RELCLASS relClass,
final Entity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final Entity entity,
final String validationDate) {
public static Relation getRelation(final String source,
final String target,
final Relation.RELTYPE relType,
final Relation.SUBRELTYPE subRelType,
final Relation.RELCLASS relClass,
final Entity entity,
final String validationDate) {
final List<Provenance> provenance = getProvenance(
entity.getCollectedfrom(), fromEntityDataInfo(entity.getDataInfo()));
return getRelation(
source, target, relType, subRelType, relClass, provenance, validationDate, null);
}
final List<Provenance> provenance = getProvenance(
entity.getCollectedfrom(), fromEntityDataInfo(entity.getDataInfo()));
return getRelation(
source, target, relType, subRelType, relClass, provenance, validationDate, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<Provenance> provenance) {
return getRelation(
source, target, relType, subRelType, relClass, provenance, null, null);
}
public static Relation getRelation(final String source,
final String target,
final Relation.RELTYPE relType,
final Relation.SUBRELTYPE subRelType,
final Relation.RELCLASS relClass,
final List<Provenance> provenance) {
return getRelation(
source, target, relType, subRelType, relClass, provenance, null, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<Provenance> provenance,
final List<KeyValue> properties) {
return getRelation(
source, target, relType, subRelType, relClass, provenance, null, properties);
}
public static Relation getRelation(final String source,
final String target,
final Relation.RELTYPE relType,
final Relation.SUBRELTYPE subRelType,
final Relation.RELCLASS relClass,
final List<Provenance> provenance,
final List<KeyValue> properties) {
return getRelation(
source, target, relType, subRelType, relClass, provenance, null, properties);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<Provenance> provenance,
final String validationDate,
final List<KeyValue> properties) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setProvenance(provenance);
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
rel.setProperties(properties);
return rel;
}
public static Relation getRelation(final String source,
final String target,
final Relation.RELTYPE relType,
final Relation.SUBRELTYPE subRelType,
final Relation.RELCLASS relClass,
final List<Provenance> provenance,
final String validationDate,
final List<KeyValue> properties) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setProvenance(provenance);
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
rel.setProperties(properties);
return rel;
}
public static List<Provenance> getProvenance(final List<KeyValue> collectedfrom, final DataInfo dataInfo) {
return collectedfrom
.stream()
.map(cf -> getProvenance(cf, dataInfo))
.collect(Collectors.toList());
}
public static List<Provenance> getProvenance(final List<KeyValue> collectedfrom, final DataInfo dataInfo) {
return collectedfrom
.stream()
.map(cf -> getProvenance(cf, dataInfo))
.collect(Collectors.toList());
}
public static Provenance getProvenance(final KeyValue collectedfrom, final DataInfo dataInfo) {
final Provenance prov = new Provenance();
prov.setCollectedfrom(collectedfrom);
prov.setDataInfo(dataInfo);
return prov;
}
public static Provenance getProvenance(final KeyValue collectedfrom, final DataInfo dataInfo) {
final Provenance prov = new Provenance();
prov.setCollectedfrom(collectedfrom);
prov.setDataInfo(dataInfo);
return prov;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
public static DataInfo fromEntityDataInfo(EntityDataInfo entityDataInfo) {
DataInfo dataInfo = new DataInfo();
dataInfo.setTrust(entityDataInfo.getTrust());
dataInfo.setInferenceprovenance(entityDataInfo.getInferenceprovenance());
dataInfo.setInferred(entityDataInfo.getInferred());
dataInfo.setProvenanceaction(entityDataInfo.getProvenanceaction());
return dataInfo;
}
public static DataInfo fromEntityDataInfo(EntityDataInfo entityDataInfo) {
DataInfo dataInfo = new DataInfo();
dataInfo.setTrust(entityDataInfo.getTrust());
dataInfo.setInferenceprovenance(entityDataInfo.getInferenceprovenance());
dataInfo.setInferred(entityDataInfo.getInferred());
dataInfo.setProvenanceaction(entityDataInfo.getProvenanceaction());
return dataInfo;
}
}

View File

@ -34,34 +34,34 @@ public class ResultTypeComparator implements Comparator<Result> {
return 1;
}
String lClass = left.getResulttype();
String rClass = right.getResulttype();
Result.RESULTTYPE lType = left.getResulttype();
Result.RESULTTYPE rType = right.getResulttype();
if (lClass.equals(rClass))
if (lType.equals(rType))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
if (lType.equals(Result.RESULTTYPE.publication))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
if (rType.equals(Result.RESULTTYPE.publication))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
if (lType.equals(Result.RESULTTYPE.dataset))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
if (rType.equals(Result.RESULTTYPE.dataset))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
if (lType.equals(Result.RESULTTYPE.software))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
if (rType.equals(Result.RESULTTYPE.software))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
if (lType.equals(Result.RESULTTYPE.otherresearchproduct))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
if (rType.equals(Result.RESULTTYPE.otherresearchproduct))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
return lType.compareTo(rType);
}
protected HashSet<String> getCollectedFromIds(Result left) {

View File

@ -1,158 +0,0 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
}
}

View File

@ -22,19 +22,6 @@ object ScholixUtils extends Serializable {
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || !relation.getProperties.isEmpty)
@ -288,11 +275,8 @@ object ScholixUtils extends Serializable {
s.setPublisher(source.getPublisher)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
new ScholixRelationship(relation.getRelClass.toString, "datacite", relation.getRelClass.getInverse.toString)
)
s.setSource(source)
@ -330,12 +314,10 @@ object ScholixUtils extends Serializable {
s.setPublisher(l.asJava)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
new ScholixRelationship(relation.getRelClass.toString, "datacite", relation.getRelClass.getInverse.toString)
)
s.setSource(generateScholixResourceFromSummary(source))
s

View File

@ -44,7 +44,7 @@ public class MergeUtilsTest {
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
final Result p1d2 = MergeUtils.merge(p1, d2);
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype());
assertEquals(Result.RESULTTYPE.publication, p1d2.getResulttype());
assertTrue(p1d2 instanceof Publication);
assertEquals(p1.getId(), p1d2.getId());
}
@ -55,7 +55,7 @@ public class MergeUtilsTest {
Dataset d1 = read("dataset_1.json", Dataset.class);
final Result p2d1 = MergeUtils.merge(p2, d1);
assertEquals(ModelConstants.DATASET_RESULTTYPE_CLASSID, p2d1.getResulttype());
assertEquals(Result.RESULTTYPE.dataset, p2d1.getResulttype());
assertTrue(p2d1 instanceof Dataset);
assertEquals(d1.getId(), p2d1.getId());
assertEquals(2, p2d1.getCollectedfrom().size());

View File

@ -18,30 +18,21 @@ object CollectionUtils {
def fixRelations(i: Oaf): List[Oaf] = {
if (i.isInstanceOf[Entity])
return List(i)
List(i)
else {
val r: Relation = i.asInstanceOf[Relation]
val currentRel = ModelSupport.findRelation(r.getRelClass)
if (currentRel != null) {
// Cleaning relation
r.setRelType(currentRel.getRelType)
r.setSubRelType(currentRel.getSubReltype)
r.setRelClass(currentRel.getRelClass)
val inverse = new Relation
inverse.setSource(r.getTarget)
inverse.setTarget(r.getSource)
inverse.setRelType(currentRel.getRelType)
inverse.setSubRelType(currentRel.getSubReltype)
inverse.setRelClass(currentRel.getInverseRelClass)
inverse.setProvenance(r.getProvenance)
inverse.setProperties(r.getProperties)
inverse.setValidated(r.getValidated)
inverse.setValidationDate(r.getValidationDate)
return List(r, inverse)
}
val inverse = new Relation
inverse.setSource(r.getTarget)
inverse.setTarget(r.getSource)
inverse.setRelType(r.getRelType)
inverse.setSubRelType(r.getSubRelType)
inverse.setRelClass(r.getRelClass.getInverse)
inverse.setProvenance(r.getProvenance)
inverse.setProperties(r.getProperties)
inverse.setValidated(r.getValidated)
inverse.setValidationDate(r.getValidationDate)
List(r, inverse)
}
List()
}
def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {

View File

@ -61,13 +61,15 @@ object CrossrefUtility {
resultList
}
private def createRelation(sourceId: String, targetId: String, relClass: String): Relation = {
private def createRelation(sourceId: String, targetId: String, relClass: Relation.RELCLASS): Relation = {
val r = new Relation
//TODO further inspect
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelType(Relation.RELTYPE.resultProject)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setSubRelType(Relation.SUBRELTYPE.outcome)
r.setProvenance(List(OafMapperUtils.getProvenance(CROSSREF_COLLECTED_FROM, null)).asJava)
r
}
@ -84,7 +86,7 @@ object CrossrefUtility {
.filter(a => a != null && a.nonEmpty)
.map(award => {
val targetId = IdentifierFactory.createOpenaireId("project", s"$nsPrefix::$award", true)
createRelation(targetId, source.getId, ModelConstants.PRODUCES)
createRelation(targetId, source.getId, Relation.RELCLASS.produces)
})
else List()
}
@ -132,15 +134,15 @@ object CrossrefUtility {
case "10.13039/501100000038" =>
val targetId =
IdentifierFactory.createOpenaireId("project", "nserc_______::1e5e62235d094afd01cd56e65112fc63", false)
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
case "10.13039/501100000155" =>
val targetId =
IdentifierFactory.createOpenaireId("project", "sshrc_______::1e5e62235d094afd01cd56e65112fc63", false)
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
case "10.13039/501100000024" =>
val targetId =
IdentifierFactory.createOpenaireId("project", "cihr________::1e5e62235d094afd01cd56e65112fc63", false)
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
case "10.13039/501100002848" =>
relList = relList ::: generateSimpleRelationFromAward(funder, "conicytf____", a => a, result)
case "10.13039/501100003448" =>
@ -153,7 +155,7 @@ object CrossrefUtility {
relList = relList ::: generateSimpleRelationFromAward(funder, "miur________", a => a, result)
val targetId =
IdentifierFactory.createOpenaireId("project", "miur________::1e5e62235d094afd01cd56e65112fc63", false)
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
case "10.13039/501100006588" | "10.13039/501100004488" =>
relList = relList ::: generateSimpleRelationFromAward(
funder,
@ -171,7 +173,7 @@ object CrossrefUtility {
relList = relList ::: generateSimpleRelationFromAward(funder, "wt__________", a => a, result)
val targetId =
IdentifierFactory.createOpenaireId("project", "wt__________::1e5e62235d094afd01cd56e65112fc63", false)
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
case _ => logger.debug("no match for " + funder.DOI.get)
}
@ -191,7 +193,7 @@ object CrossrefUtility {
relList = relList ::: generateSimpleRelationFromAward(funder, "wt__________", a => a, result)
val targetId =
IdentifierFactory.createOpenaireId("project", "wt__________::1e5e62235d094afd01cd56e65112fc63", false)
relList = relList ::: List(createRelation(targetId, result.getId, ModelConstants.PRODUCES))
relList = relList ::: List(createRelation(targetId, result.getId, Relation.RELCLASS.produces))
case _ => logger.debug("no match for " + funder.name)
}

View File

@ -78,124 +78,6 @@ object DataciteModelConstants {
val DATACITE_COLLECTED_FROM: KeyValue =
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val subRelTypeMapping: Map[String, OAFRelations] = Map(
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
ModelConstants.IS_SUPPLEMENTED_BY,
ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.SUPPLEMENT
),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.IS_SUPPLEMENTED_BY,
ModelConstants.SUPPLEMENT
),
ModelConstants.HAS_PART -> OAFRelations(
ModelConstants.HAS_PART,
ModelConstants.IS_PART_OF,
ModelConstants.PART
),
ModelConstants.IS_PART_OF -> OAFRelations(
ModelConstants.IS_PART_OF,
ModelConstants.HAS_PART,
ModelConstants.PART
),
ModelConstants.IS_VERSION_OF -> OAFRelations(
ModelConstants.IS_VERSION_OF,
ModelConstants.HAS_VERSION,
ModelConstants.VERSION
),
ModelConstants.HAS_VERSION -> OAFRelations(
ModelConstants.HAS_VERSION,
ModelConstants.IS_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
ModelConstants.IS_IDENTICAL_TO,
ModelConstants.IS_IDENTICAL_TO,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
ModelConstants.IS_CONTINUED_BY,
ModelConstants.CONTINUES,
ModelConstants.RELATIONSHIP
),
ModelConstants.CONTINUES -> OAFRelations(
ModelConstants.CONTINUES,
ModelConstants.IS_CONTINUED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.IS_PREVIOUS_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
ModelConstants.IS_PREVIOUS_VERSION_OF,
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.DOCUMENTS,
ModelConstants.RELATIONSHIP
),
ModelConstants.DOCUMENTS -> OAFRelations(
ModelConstants.DOCUMENTS,
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_SOURCE_OF -> OAFRelations(
ModelConstants.IS_SOURCE_OF,
ModelConstants.IS_DERIVED_FROM,
ModelConstants.VERSION
),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
ModelConstants.IS_DERIVED_FROM,
ModelConstants.IS_SOURCE_OF,
ModelConstants.VERSION
),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
ModelConstants.IS_VARIANT_FORM_OF,
ModelConstants.IS_DERIVED_FROM,
ModelConstants.VERSION
),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
ModelConstants.IS_OBSOLETED_BY,
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.REVIEWS -> OAFRelations(
ModelConstants.REVIEWS,
ModelConstants.IS_REVIEWED_BY,
ModelConstants.REVIEW
),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
ModelConstants.IS_REVIEWED_BY,
ModelConstants.REVIEWS,
ModelConstants.REVIEW
),
ModelConstants.DOCUMENTS -> OAFRelations(
ModelConstants.DOCUMENTS,
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.DOCUMENTS,
ModelConstants.RELATIONSHIP
),
ModelConstants.COMPILES -> OAFRelations(
ModelConstants.COMPILES,
ModelConstants.IS_COMPILED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_COMPILED_BY -> OAFRelations(
ModelConstants.IS_COMPILED_BY,
ModelConstants.COMPILES,
ModelConstants.RELATIONSHIP
)
)
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream != null)

View File

@ -286,7 +286,7 @@ object DataciteToOAFTransformation {
def generateRelation(
sourceId: String,
targetId: String,
relClass: String,
relClass: Relation.RELCLASS,
collectedFrom: KeyValue,
di: DataInfo
): Relation = {
@ -294,9 +294,9 @@ object DataciteToOAFTransformation {
val r = new Relation
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelType(Relation.RELTYPE.resultProject)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setSubRelType(Relation.SUBRELTYPE.outcome)
r.setProvenance(Lists.newArrayList(OafMapperUtils.getProvenance(collectedFrom, di)))
r
}
@ -309,7 +309,7 @@ object DataciteToOAFTransformation {
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, relDataInfo))
List(generateRelation(sourceId, targetId, Relation.RELCLASS.isProducedBy, DATACITE_COLLECTED_FROM, relDataInfo))
} else
List()
@ -622,8 +622,7 @@ object DataciteToOAFTransformation {
): List[Relation] = {
val bidirectionalRels: List[Relation] = rels
.filter(r =>
subRelTypeMapping
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
Relation.RELCLASS.exists(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)

View File

@ -1,122 +0,0 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
public class CleanContextSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanContextSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
String contextId = parser.get("contextId");
log.info("contextId: {}", contextId);
String verifyParam = parser.get("verifyParam");
log.info("verifyParam: {}", verifyParam);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingDir);
});
}
private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
String inputPath, Class<T> entityClazz, String workingDir) {
Dataset<T> res = spark
.read()
.textFile(inputPath)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz));
res.map((MapFunction<T, T>) r -> {
if (!r
.getTitle()
.stream()
.filter(
t -> t
.getQualifier()
.getClassid()
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))) {
return r;
}
r
.setContext(
r
.getContext()
.stream()
.filter(
c -> !c.getId().split("::")[0]
.equalsIgnoreCase(contextId))
.collect(Collectors.toList()));
return r;
}, Encoders.bean(entityClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir);
spark
.read()
.textFile(workingDir)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
}
}

View File

@ -3,12 +3,20 @@ package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import java.util.*;
import java.util.stream.Stream;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.common.ModelSupport;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@ -20,12 +28,16 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Entity;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class CleanGraphSparkJob {
@ -33,79 +45,293 @@ public class CleanGraphSparkJob {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
private ArgumentApplicationParser parser;
public CleanGraphSparkJob(ArgumentApplicationParser parser) {
this.parser = parser;
}
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanGraphSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
.toString(
CleanGraphSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
new CleanGraphSparkJob(parser).run(isSparkSessionManaged, isLookup);
}
public void run(Boolean isSparkSessionManaged, ISLookUpService isLookUpService)
throws ISLookUpException, ClassNotFoundException {
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
String contextId = parser.get("contextId");
log.info("contextId: {}", contextId);
String verifyParam = parser.get("verifyParam");
log.info("verifyParam: {}", verifyParam);
String datasourcePath = parser.get("hostedBy");
log.info("datasourcePath: {}", datasourcePath);
String country = parser.get("country");
log.info("country: {}", country);
String[] verifyCountryParam = Optional
.ofNullable(parser.get("verifyCountryParam"))
.map(s -> s.split(";"))
.orElse(new String[] {});
log.info("verifyCountryParam: {}", verifyCountryParam);
String collectedfrom = parser.get("collectedfrom");
log.info("collectedfrom: {}", collectedfrom);
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
Boolean deepClean = Optional
.ofNullable(parser.get("deepClean"))
.map(Boolean::valueOf)
.orElse(Boolean.FALSE);
log.info("deepClean: {}", deepClean);
Class<? extends Entity> entityClazz = (Class<? extends Entity>) Class.forName(graphTableClassName);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
SparkConf conf = new SparkConf();
conf.setAppName(CleanGraphSparkJob.class.getSimpleName() + "#" + entityClazz.getSimpleName());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
cleanGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
});
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
cleanGraphTable(
spark, vocs, inputPath, entityClazz, outputPath, contextId, verifyParam, datasourcePath, country,
verifyCountryParam, collectedfrom, dsMasterDuplicatePath, deepClean);
});
}
private static <T extends Oaf> void cleanGraphTable(
SparkSession spark,
VocabularyGroup vocs,
String inputPath,
Class<T> clazz,
String outputPath) {
SparkSession spark,
VocabularyGroup vocs,
String inputPath,
Class<T> clazz,
String outputPath, String contextId, String verifyParam, String datasourcePath, String country,
String[] verifyCountryParam, String collectedfrom, String dsMasterDuplicatePath,
Boolean deepClean) {
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
.filter((FilterFunction<T>) GraphCleaningFunctions::filter)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
final Dataset<T> cleaned_basic = readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
.filter((FilterFunction<T>) GraphCleaningFunctions::filter);
// read the master-duplicate tuples
Dataset<MasterDuplicate> md = spark
.read()
.textFile(dsMasterDuplicatePath)
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
Dataset<IdCfHbMapping> resolved = spark
.read()
.textFile(inputPath)
.map(as(clazz), Encoders.bean(clazz))
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
if (Boolean.FALSE.equals(deepClean)) {
if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
save(fixCFHB(clazz, cleaned_basic, md, resolved), outputPath);
} else {
save(cleaned_basic, outputPath);
}
} else if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
// load the hostedby mapping
Set<String> hostedBy = Sets
.newHashSet(
spark
.read()
.textFile(datasourcePath)
.collectAsList());
// perform the deep cleaning steps
final Dataset<T> cleaned_deep = fixCFHB(clazz, cleaned_basic, md, resolved)
.map(
(MapFunction<T, T>) value -> GraphCleaningFunctions.cleanContext(value, contextId, verifyParam),
Encoders.bean(clazz))
.map(
(MapFunction<T, T>) value -> GraphCleaningFunctions
.cleanCountry(value, verifyCountryParam, hostedBy, collectedfrom, country),
Encoders.bean(clazz));
save(cleaned_deep, outputPath);
} else {
save(cleaned_basic, outputPath);
}
}
private static <T extends Oaf> void save(final Dataset<T> dataset, final String outputPath) {
dataset
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static <T extends Oaf> Dataset<T> fixCFHB(Class<T> clazz, Dataset<T> results, Dataset<MasterDuplicate> md,
Dataset<IdCfHbMapping> resolved) {
// set the EMPTY master ID/NAME
Dataset<IdCfHbMapping> resolvedDs = resolved
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()));
return results
.joinWith(resolvedDs, results.col("id").equalTo(resolvedDs.col("resultId")), "left")
.groupByKey(
(MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> ((Result) t._1()).getId(), Encoders.STRING())
.mapGroups(getMapGroupsFunction(), Encoders.bean(clazz));
}
private static <T extends Oaf> Dataset<T> readTableFromPath(
SparkSession spark, String inputEntityPath, Class<T> clazz) {
SparkSession spark, String inputEntityPath, Class<T> clazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
.read()
.textFile(inputEntityPath)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
Encoders.bean(clazz));
.read()
.textFile(inputEntityPath)
.map(as(clazz), Encoders.bean(clazz));
}
private static <R> MapFunction<String, R> as(Class<R> clazz) {
return s -> OBJECT_MAPPER.readValue(s, clazz);
}
private static <T extends Oaf> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
return t -> {
if (!ModelSupport.isSubClass(t, Result.class)) {
return Iterators.emptyIterator();
}
final Result r = (Result) t;
return Stream
.concat(
Optional
.ofNullable(r.getCollectedfrom())
.map(cf -> cf.stream().map(KeyValue::getKey))
.orElse(Stream.empty()),
Stream
.concat(
Optional
.ofNullable(r.getInstance())
.map(
instances -> instances
.stream()
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
.orElse(Stream.empty())
.filter(StringUtils::isNotBlank),
Optional
.ofNullable(r.getInstance())
.map(
instances -> instances
.stream()
.map(
i -> Optional
.ofNullable(i.getCollectedfrom())
.map(KeyValue::getKey)
.orElse("")))
.orElse(Stream.empty())
.filter(StringUtils::isNotBlank)))
.distinct()
.filter(StringUtils::isNotBlank)
.map(cfHb -> asIdCfHbMapping(r.getId(), cfHb))
.iterator();
};
}
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
return t -> {
final IdCfHbMapping mapping = t._1();
Optional
.ofNullable(t._2())
.ifPresent(t2 -> {
mapping.setMasterId(t2.getMasterId());
mapping.setMasterName(t2.getMasterName());
});
return mapping;
};
}
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
IdCfHbMapping m = new IdCfHbMapping(resultId);
m.setCfhb(cfHb);
return m;
}
private static <T extends Oaf> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
@Override
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
final Tuple2<T, IdCfHbMapping> first = values.next();
final T res = first._1();
updateResult(res, first._2());
values.forEachRemaining(t -> updateResult(res, t._2()));
return res;
}
private void updateResult(T t, IdCfHbMapping m) {
if (Objects.nonNull(m) && (ModelSupport.isSubClass(t, Result.class))) {
final Result res = (Result) t;
filter(res.getCollectedfrom()).forEach(kv -> updateKeyValue(kv, m));
res.getInstance().forEach(i -> {
updateKeyValue(i.getHostedby(), m);
updateKeyValue(i.getCollectedfrom(), m);
});
}
}
private Stream<KeyValue> filter(List<KeyValue> kvs) {
return kvs
.stream()
.filter(kv -> StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()));
}
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
if (Objects.nonNull(kv) && Objects.nonNull(kv.getKey()) && kv.getKey().equals(a.getCfhb())) {
kv.setKey(a.getMasterId());
kv.setValue(a.getMasterName());
}
}
};
}
}

View File

@ -1,10 +1,9 @@
package eu.dnetlib.dhp.oa.graph.clean.country;
package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
@ -21,7 +20,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;

View File

@ -1,227 +0,0 @@
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class CleanCfHbSparkJob {
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanCountrySparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_cfhb_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String resolvedPath = parser.get("resolvedPath");
log.info("resolvedPath: {}", resolvedPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
HdfsSupport.remove(resolvedPath, spark.sparkContext().hadoopConfiguration());
cleanCfHb(
spark, inputPath, entityClazz, resolvedPath, dsMasterDuplicatePath, outputPath);
});
}
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
String resolvedPath, String masterDuplicatePath, String outputPath) {
// read the master-duplicate tuples
Dataset<MasterDuplicate> md = spark
.read()
.textFile(masterDuplicatePath)
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
Dataset<IdCfHbMapping> resolved = spark
.read()
.textFile(inputPath)
.map(as(entityClazz), Encoders.bean(entityClazz))
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
// set the EMPTY master ID/NAME and save it
resolved
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()))
.write()
.mode(SaveMode.Overwrite)
.json(resolvedPath);
// read again the resolved CF|HB mapping
Dataset<IdCfHbMapping> resolvedDS = spark
.read()
.textFile(resolvedPath)
.map(as(IdCfHbMapping.class), Encoders.bean(IdCfHbMapping.class));
// read the result table
Dataset<T> res = spark
.read()
.textFile(inputPath)
.map(as(entityClazz), Encoders.bean(entityClazz));
// Join the results with the resolved CF|HB mapping, apply the mapping and save it
res
.joinWith(resolvedDS, res.col("id").equalTo(resolvedDS.col("resultId")), "left")
.groupByKey((MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> t._1().getId(), Encoders.STRING())
.mapGroups(getMapGroupsFunction(), Encoders.bean(entityClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
return t -> {
final IdCfHbMapping mapping = t._1();
Optional
.ofNullable(t._2())
.ifPresent(t2 -> {
mapping.setMasterId(t2.getMasterId());
mapping.setMasterName(t2.getMasterName());
});
return mapping;
};
}
private static <T extends Result> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
return r -> Stream
.concat(
Optional
.ofNullable(r.getCollectedfrom())
.map(cf -> cf.stream().map(KeyValue::getKey))
.orElse(Stream.empty()),
Stream
.concat(
Optional
.ofNullable(r.getInstance())
.map(
instances -> instances
.stream()
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
.orElse(Stream.empty())
.filter(StringUtils::isNotBlank),
Optional
.ofNullable(r.getInstance())
.map(
instances -> instances
.stream()
.map(
i -> Optional
.ofNullable(i.getCollectedfrom())
.map(KeyValue::getKey)
.orElse("")))
.orElse(Stream.empty())
.filter(StringUtils::isNotBlank)))
.distinct()
.filter(StringUtils::isNotBlank)
.map(cfHb -> asIdCfHbMapping(r.getId(), cfHb))
.iterator();
}
private static <T extends Result> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
@Override
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
final Tuple2<T, IdCfHbMapping> first = values.next();
final T res = first._1();
updateResult(res, first._2());
values.forEachRemaining(t -> updateResult(res, t._2()));
return res;
}
private void updateResult(T res, IdCfHbMapping m) {
if (Objects.nonNull(m)) {
res.getCollectedfrom().forEach(kv -> updateKeyValue(kv, m));
res.getInstance().forEach(i -> {
updateKeyValue(i.getHostedby(), m);
updateKeyValue(i.getCollectedfrom(), m);
});
}
}
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
if (kv.getKey().equals(a.getCfhb())) {
kv.setKey(a.getMasterId());
kv.setValue(a.getMasterName());
}
}
};
}
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
IdCfHbMapping m = new IdCfHbMapping(resultId);
m.setCfhb(cfHb);
return m;
}
private static <R> MapFunction<String, R> as(Class<R> clazz) {
return s -> OBJECT_MAPPER.readValue(s, clazz);
}
}

View File

@ -1,211 +0,0 @@
package eu.dnetlib.dhp.oa.graph.clean.country;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.swing.text.html.Option;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author miriam.baglioni
* @Date 20/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public class CleanCountrySparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CleanCountrySparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanCountrySparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
String datasourcePath = parser.get("hostedBy");
log.info("datasourcePath: {}", datasourcePath);
String country = parser.get("country");
log.info("country: {}", country);
String[] verifyParam = parser.get("verifyParam").split(";");
log.info("verifyParam: {}", verifyParam);
String collectedfrom = parser.get("collectedfrom");
log.info("collectedfrom: {}", collectedfrom);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
cleanCountry(
spark, country, verifyParam, inputPath, entityClazz, workingDir, collectedfrom, datasourcePath);
});
}
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
String inputPath, Class<T> entityClazz, String workingDir, String collectedfrom, String datasourcePath) {
List<String> hostedBy = spark
.read()
.textFile(datasourcePath)
.collectAsList();
Dataset<T> res = spark
.read()
.textFile(inputPath)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz));
res.map((MapFunction<T, T>) r -> {
if (r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
!r.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
return r;
}
List<StructuredProperty> ids = getPidsAndAltIds(r).collect(Collectors.toList());
if (ids
.stream()
.anyMatch(
p -> p
.getQualifier()
.getClassid()
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
r
.setCountry(
r
.getCountry()
.stream()
.filter(
c -> toTakeCountry(c, country))
.collect(Collectors.toList()));
}
return r;
}, Encoders.bean(entityClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir);
spark
.read()
.textFile(workingDir)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
}
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
final Stream<StructuredProperty> resultPids = Optional
.ofNullable(r.getPid())
.map(Collection::stream)
.orElse(Stream.empty());
final Stream<StructuredProperty> instancePids = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getPid())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
final Stream<StructuredProperty> instanceAltIds = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getAlternateIdentifier())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
return Stream
.concat(
Stream.concat(resultPids, instancePids),
instanceAltIds);
}
private static boolean pidInParam(String value, String[] verifyParam) {
for (String s : verifyParam)
if (value.startsWith(s))
return true;
return false;
}
private static boolean toTakeCountry(Country c, String country) {
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
// inserted via propagation
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
return true;
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
return true;
return !(c
.getClassid()
.equalsIgnoreCase(country) &&
c.getDataInfo().getInferenceprovenance().equals("propagation"));
}
}

View File

@ -83,12 +83,57 @@
</property>
</parameters>
<start to="fork_clean_graph"/>
<start to="prepare_info"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="prepare_info">
<path start="select_datasourceId_from_country"/>
<path start="get_ds_master_duplicate"/>
</fork>
<action name="select_datasourceId_from_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Select datasource ID from country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.GetDatasourceFromCountry</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--country</arg><arg>${country}</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
</action>
<action name="get_ds_master_duplicate">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
</java>
<ok to="wait_prepare"/>
<error to="Kill"/>
</action>
<join name="wait_prepare" to="fork_clean_graph"/>
<fork name="fork_clean_graph">
<path start="clean_publication"/>
<path start="clean_dataset"/>
@ -115,12 +160,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=15000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -141,12 +194,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -167,12 +228,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=5000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -193,12 +262,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=2000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -219,12 +296,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=1000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -245,12 +330,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=1000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -271,12 +364,20 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=2000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
@ -297,486 +398,26 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.sql.shuffle.partitions=20000
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--deepClean</arg><arg>${shouldClean}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<join name="wait_clean" to="clean_context"/>
<decision name="clean_context">
<switch>
<case to="fork_clean_context">${wf:conf('shouldClean') eq true}</case>
<default to="End"/>
</switch>
</decision>
<fork name="fork_clean_context">
<path start="clean_publication_context"/>
<path start="clean_dataset_context"/>
<path start="clean_otherresearchproduct_context"/>
<path start="clean_software_context"/>
</fork>
<action name="clean_publication_context">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean publications context</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
<ok to="wait_clean_context"/>
<error to="Kill"/>
</action>
<action name="clean_dataset_context">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean datasets Context</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
<ok to="wait_clean_context"/>
<error to="Kill"/>
</action>
<action name="clean_otherresearchproduct_context">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean otherresearchproducts context</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
<ok to="wait_clean_context"/>
<error to="Kill"/>
</action>
<action name="clean_software_context">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean softwares context</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
<ok to="wait_clean_context"/>
<error to="Kill"/>
</action>
<join name="wait_clean_context" to="select_datasourceId_from_country"/>
<action name="select_datasourceId_from_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Select datasource ID from country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--country</arg><arg>${country}</arg>
</spark>
<ok to="fork_clean_country"/>
<error to="Kill"/>
</action>
<fork name="fork_clean_country">
<path start="clean_publication_country"/>
<path start="clean_dataset_country"/>
<path start="clean_otherresearchproduct_country"/>
<path start="clean_software_country"/>
</fork>
<action name="clean_publication_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean publication country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
<action name="clean_dataset_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean dataset country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
<action name="clean_otherresearchproduct_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean otherresearchproduct country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
<action name="clean_software_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean software country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
<join name="wait_clean_country" to="should_patch_datasource_ids"/>
<decision name="should_patch_datasource_ids">
<switch>
<case to="get_ds_master_duplicate">${wf:conf('shouldClean') eq true}</case>
<default to="End"/>
</switch>
</decision>
<action name="get_ds_master_duplicate">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
</java>
<ok to="fork_patch_cfhb"/>
<error to="Kill"/>
</action>
<fork name="fork_patch_cfhb">
<path start="patch_publication_cfhb"/>
<path start="patch_dataset_cfhb"/>
<path start="patch_otherresearchproduct_cfhb"/>
<path start="patch_software_cfhb"/>
</fork>
<action name="patch_publication_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch publication cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<action name="patch_dataset_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch dataset cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<action name="patch_otherresearchproduct_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch otherresearchproduct cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<action name="patch_software_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch software cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<join name="wait_clean_cfhb" to="fork_copy_cfhb_patched_results"/>
<fork name="fork_copy_cfhb_patched_results">
<path start="copy_cfhb_patched_publication"/>
<path start="copy_cfhb_patched_dataset"/>
<path start="copy_cfhb_patched_otherresearchproduct"/>
<path start="copy_cfhb_patched_software"/>
</fork>
<action name="copy_cfhb_patched_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/publication"/>
</prepare>
<arg>${workingDir}/cfHbPatched/publication</arg>
<arg>${graphOutputPath}/publication</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_cfhb_patched_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/dataset"/>
</prepare>
<arg>${workingDir}/cfHbPatched/dataset</arg>
<arg>${graphOutputPath}/dataset</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_cfhb_patched_otherresearchproduct">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/otherresearchproduct"/>
</prepare>
<arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
<arg>${graphOutputPath}/otherresearchproduct</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_cfhb_patched_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/software"/>
</prepare>
<arg>${workingDir}/cfHbPatched/software</arg>
<arg>${graphOutputPath}/software</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="End"/>
<join name="wait_clean" to="End"/>
<end name="End"/>

View File

@ -28,5 +28,53 @@
"paramLongName": "graphTableClassName",
"paramDescription": "class name moelling the graph table",
"paramRequired": true
},
{
"paramName": "ci",
"paramLongName": "contextId",
"paramDescription": "the id of the context to be removed",
"paramRequired": false
},
{
"paramName": "vf",
"paramLongName": "verifyParam",
"paramDescription": "the parameter to be verified to remove the context",
"paramRequired": false
},
{
"paramName": "c",
"paramLongName": "country",
"paramDescription": "the id of the context to be removed",
"paramRequired": false
},
{
"paramName": "vfc",
"paramLongName": "verifyCountryParam",
"paramDescription": "the parameter to be verified to remove the country",
"paramRequired": false
},
{
"paramName": "cf",
"paramLongName": "collectedfrom",
"paramDescription": "the collectedfrom value for which we should apply the cleaning",
"paramRequired": false
},
{
"paramName": "hb",
"paramLongName": "hostedBy",
"paramDescription": "the set of datasources having the specified country in the graph searched for in the hostedby of the results",
"paramRequired": false
},
{
"paramName": "md",
"paramLongName": "masterDuplicatePath",
"paramDescription": "path to the file on HDFS holding the datasource id tuples [master, duplicate]",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "deepClean",
"paramDescription": "flag to activate further cleaning steps",
"paramRequired": true
}
]

View File

@ -1,289 +0,0 @@
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleanContextTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(CleanContextTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(CleanContextTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(CleanContextTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testResultClean() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json")
.getPath();
final String prefix = "gcube ";
spark
.read()
.textFile(sourcePath)
.map(
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
Encoders.bean(Publication.class))
.write()
.json(workingDir.toString() + "/publication");
CleanContextSparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/publication",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--workingDir", workingDir.toString() + "/working",
"--contextId", "sobigdata",
"--verifyParam", "gCube "
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/publication")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
Assertions.assertEquals(7, tmp.count());
// original result with sobigdata context and gcube as starting string in the main title for the publication
Assertions
.assertEquals(
0,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
.collect()
.get(0)
.getContext()
.size());
// original result with sobigdata context without gcube as starting string in the main title for the publication
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
.collect()
.get(0)
.getContext()
.size());
Assertions
.assertEquals(
"sobigdata::projects::2",
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
.collect()
.get(0)
.getContext()
.get(0)
.getId());
// original result with sobigdata context with gcube as starting string in the subtitle
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
.collect()
.get(0)
.getContext()
.size());
Assertions
.assertEquals(
"sobigdata::projects::2",
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
.collect()
.get(0)
.getContext()
.get(0)
.getId());
List<StructuredProperty> titles = tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
.collect()
.get(0)
.getTitle();
Assertions.assertEquals(1, titles.size());
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
// original result with sobigdata context with gcube not as starting string in the main title
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
.collect()
.get(0)
.getContext()
.size());
Assertions
.assertEquals(
"sobigdata::projects::1",
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
.collect()
.get(0)
.getContext()
.get(0)
.getId());
titles = tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
.collect()
.get(0)
.getTitle();
Assertions.assertEquals(1, titles.size());
Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
// title
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
.collect()
.get(0)
.getContext()
.size());
Assertions
.assertEquals(
"dh-ch",
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
.collect()
.get(0)
.getContext()
.get(0)
.getId());
titles = tmp
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
.collect()
.get(0)
.getTitle();
Assertions.assertEquals(1, titles.size());
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
// original result with multiple main title one of which whith gcube as starting string and with 2 contextes
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
.collect()
.get(0)
.getContext()
.size());
Assertions
.assertEquals(
"dh-ch",
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
.collect()
.get(0)
.getContext()
.get(0)
.getId());
titles = tmp
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
.collect()
.get(0)
.getTitle();
Assertions.assertEquals(2, titles.size());
Assertions
.assertTrue(
titles
.stream()
.anyMatch(
t -> t.getQualifier().getClassid().equals("main title")
&& t.getValue().toLowerCase().startsWith(prefix)));
// original result without sobigdata in context with gcube as starting string for the main title
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
.collect()
.get(0)
.getContext()
.size());
Assertions
.assertEquals(
"dh-ch",
tmp
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
.collect()
.get(0)
.getContext()
.get(0)
.getId());
titles = tmp
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
.collect()
.get(0)
.getTitle();
Assertions.assertEquals(2, titles.size());
Assertions
.assertTrue(
titles
.stream()
.anyMatch(
t -> t.getQualifier().getClassid().equals("main title")
&& t.getValue().toLowerCase().startsWith(prefix)));
}
}

View File

@ -1,190 +0,0 @@
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author miriam.baglioni
* @Date 20/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class CleanCountryTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(CleanCountryTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(CleanCountryTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(CleanCountryTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testResultClean() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_country.json")
.getPath();
spark
.read()
.textFile(sourcePath)
.map(
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
Encoders.bean(Publication.class))
.write()
.json(workingDir.toString() + "/publication");
CleanCountrySparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/publication",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--workingDir", workingDir.toString() + "/working",
"--country", "NL",
"--verifyParam", "10.17632",
"--collectedfrom", "NARCIS",
"--hostedBy", getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
.getPath()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/publication")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
Assertions.assertEquals(8, tmp.count());
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
.collect()
.get(0)
.getCountry()
.size());
// original result with NL country and pid not starting with Mendely prefix
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
.collect()
.get(0)
.getCountry()
.size());
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
// inserted with propagation
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
.collect()
.get(0)
.getCountry()
.size());
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
// propagation
Assertions
.assertEquals(
0,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6ag"))
.collect()
.get(0)
.getCountry()
.size());
}
@Test
public void testDatasetClean() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json")
.getPath();
spark
.read()
.textFile(sourcePath)
.map(
(MapFunction<String, Dataset>) r -> OBJECT_MAPPER.readValue(r, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.json(workingDir.toString() + "/dataset");
CleanCountrySparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/dataset",
"-graphTableClassName", Dataset.class.getCanonicalName(),
"-workingDir", workingDir.toString() + "/working",
"-country", "NL",
"-verifyParam", "10.17632",
"-collectedfrom", "NARCIS",
"-hostedBy", getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
.getPath()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(1, tmp.count());
Assertions.assertEquals(0, tmp.first().getCountry().size());
}
}

View File

@ -0,0 +1,924 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.FalseFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class CleanGraphSparkJobTest {
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJobTest.class);
public static final ObjectMapper MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Mock
private ISLookUpService isLookUpService;
private VocabularyGroup vocabularies;
private CleaningRuleMap mapping;
private static SparkSession spark;
private static Path testBaseTmpPath;
private static String graphInputPath;
private static String graphOutputPath;
private static String dsMasterDuplicatePath;
@BeforeAll
public static void beforeAll() throws IOException, URISyntaxException {
testBaseTmpPath = Files.createTempDirectory(CleanGraphSparkJobTest.class.getSimpleName());
log.info("using test base path {}", testBaseTmpPath);
File basePath = Paths
.get(
Objects
.requireNonNull(
CleanGraphSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/graph"))
.toURI())
.toFile();
List<File> paths = FileUtils
.listFilesAndDirs(basePath, FalseFileFilter.FALSE, TrueFileFilter.TRUE)
.stream()
.filter(f -> !f.getAbsolutePath().endsWith("/graph"))
.collect(Collectors.toList());
for (File path : paths) {
String type = StringUtils.substringAfterLast(path.getAbsolutePath(), "/");
FileUtils
.copyDirectory(
path,
testBaseTmpPath.resolve("input").resolve("graph").resolve(type).toFile());
}
FileUtils
.copyFileToDirectory(
Paths
.get(
CleanGraphSparkJobTest.class
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
.toURI())
.toFile(),
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
graphInputPath = testBaseTmpPath.resolve("input").resolve("graph").toString();
graphOutputPath = testBaseTmpPath.resolve("output").resolve("graph").toString();
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
SparkConf conf = new SparkConf();
conf.setAppName(CleanGraphSparkJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", testBaseTmpPath.toString());
conf.set("hive.metastore.warehouse.dir", testBaseTmpPath.resolve("warehouse").toString());
spark = SparkSession
.builder()
.config(conf)
.getOrCreate();
}
@BeforeEach
public void setUp() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
mapping = CleaningRuleMap.create(vocabularies);
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
spark.stop();
}
@Test
void testCleanRelations() throws Exception {
spark
.read()
.textFile(graphInputPath.toString() + "/relation")
.map(as(Relation.class), Encoders.bean(Relation.class))
.collectAsList()
.forEach(
r -> assertFalse(
vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass())));
new CleanGraphSparkJob(
args(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
new String[] {
"--inputPath", graphInputPath + "/relation",
"--outputPath", graphOutputPath + "/relation",
"--isLookupUrl", "lookupurl",
"--graphTableClassName", Relation.class.getCanonicalName(),
"--deepClean", "false",
"--masterDuplicatePath", dsMasterDuplicatePath,
})).run(false, isLookUpService);
spark
.read()
.textFile(graphOutputPath.toString() + "/relation")
.map(as(Relation.class), Encoders.bean(Relation.class))
.collectAsList()
.forEach(r -> {
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass()));
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r.getSubRelType()));
assertEquals("iis", r.getProvenance().get(0).getDataInfo().getProvenanceaction().getClassid());
assertEquals("Inferred by OpenAIRE", r.getProvenance().get(0).getDataInfo().getProvenanceaction().getClassname());
});
}
@Test
void testFilter_invisible_true() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(
Objects
.requireNonNull(
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json")));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test
void testFilter_true_nothing_to_filter() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(
Objects
.requireNonNull(
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test
void testFilter_missing_invisible() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(
Objects
.requireNonNull(
getClass()
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json")));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test
void testCleaning_publication() throws Exception {
final String id = "50|CSC_________::2250a70c903c6ac6e4c01438259e9375";
Publication p_in = read(spark, graphInputPath + "/publication", Publication.class)
.filter(String.format("id = '%s'", id))
.first();
assertNull(p_in.getBestaccessright());
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
new CleanGraphSparkJob(
args(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
new String[] {
"--inputPath", graphInputPath + "/publication",
"--outputPath", graphOutputPath + "/publication",
"--isLookupUrl", "lookupurl",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--deepClean", "false",
"--masterDuplicatePath", dsMasterDuplicatePath,
})).run(false, isLookUpService);
Publication p = read(spark, graphOutputPath + "/publication", Publication.class)
.filter(String.format("id = '%s'", id))
.first();
assertNull(p.getPublisher());
assertEquals("und", p.getLanguage().getClassid());
assertEquals("Undetermined", p.getLanguage().getClassname());
assertEquals("DE", p.getCountry().get(0).getClassid());
assertEquals("Germany", p.getCountry().get(0).getClassname());
assertEquals("0018", p.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p.getInstance().get(0).getInstancetype().getClassname());
assertEquals("0027", p.getInstance().get(1).getInstancetype().getClassid());
assertEquals("Model", p.getInstance().get(1).getInstancetype().getClassname());
assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
assertEquals("CLOSED", p.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p.getInstance().get(0).getAccessright().getClassname());
Set<String> pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES);
assertTrue(
p
.getPid()
.stream()
.map(StructuredProperty::getQualifier)
.allMatch(q -> pidTerms.contains(q.getClassid())));
List<Instance> poi = p.getInstance();
assertNotNull(poi);
assertEquals(3, poi.size());
final Instance poii = poi.get(0);
assertNotNull(poii);
assertNotNull(poii.getPid());
assertEquals(2, poii.getPid().size());
assertTrue(
poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
assertNotNull(poii.getAlternateIdentifier());
assertEquals(1, poii.getAlternateIdentifier().size());
assertTrue(
poii
.getAlternateIdentifier()
.stream()
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
assertEquals(3, p.getTitle().size());
List<String> titles = p
.getTitle()
.stream()
.map(StructuredProperty::getValue)
.collect(Collectors.toList());
assertTrue(titles.contains("omic"));
assertTrue(
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
assertEquals("CLOSED", p.getBestaccessright().getClassid());
assertNull(p.getPublisher());
assertEquals("1970-10-07", p.getDateofacceptance());
assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
final List<Instance> pci = p.getInstance();
assertNotNull(pci);
assertEquals(3, pci.size());
final Instance pcii = pci.get(0);
assertNotNull(pcii);
assertNotNull(pcii.getPid());
assertEquals(2, pcii.getPid().size());
assertTrue(
pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
assertNotNull(pcii.getAlternateIdentifier());
assertEquals(1, pcii.getAlternateIdentifier().size());
assertTrue(
pcii
.getAlternateIdentifier()
.stream()
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
assertNotNull(p.getSubject());
List<Subject> fos_subjects = p
.getSubject()
.stream()
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size());
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive"
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
verify_keyword(p, "In Situ Hybridization");
verify_keyword(p, "Avicennia");
}
@Test
void testCleanDoiBoost() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
verifyFiltering(1, "50|doi_________::b0baa0eb88a5788f0b8815560d2a32f2");
}
@Test
void testCleanDoiBoost2() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
verifyFiltering(1, "50|doi_________::4972b0ca81b96b225aed8038bb965656");
}
private void verifyFiltering(int expectedCount, String id)
throws ISLookUpException, ClassNotFoundException, IOException, ParseException {
new CleanGraphSparkJob(
args(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
new String[] {
"--inputPath", graphInputPath + "/publication",
"--outputPath", graphOutputPath + "/publication",
"--isLookupUrl", "lookupurl",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--deepClean", "false",
"--masterDuplicatePath", dsMasterDuplicatePath,
})).run(false, isLookUpService);
Dataset<Publication> p = read(spark, graphOutputPath + "/publication", Publication.class)
.filter(String.format("id = '%s'", id));
assertEquals(expectedCount, p.count());
}
@Test
void testCleanContext() throws Exception {
final String prefix = "gcube ";
new CleanGraphSparkJob(
args(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
new String[] {
"--inputPath", graphInputPath + "/publication",
"--outputPath", graphOutputPath + "/publication",
"--isLookupUrl", "lookupurl",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--deepClean", "true",
"--contextId", "sobigdata",
"--verifyParam", "gCube ",
"--masterDuplicatePath", dsMasterDuplicatePath,
"--country", "NL",
"--verifyCountryParam", "10.17632",
"--collectedfrom", "NARCIS",
"--hostedBy", Objects
.requireNonNull(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
.getPath()
})).run(false, isLookUpService);
Dataset<Publication> pubs = read(spark, graphOutputPath + "/publication", Publication.class)
.filter((FilterFunction<Publication>) p1 -> StringUtils.endsWith(p1.getId(), "_ctx"));
assertEquals(7, pubs.count());
// original result with sobigdata context and gcube as starting string in the main title for the publication
assertEquals(
0,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::0224aae28af558f21768dbc6439a_ctx"))
.first()
.getContext()
.size());
// original result with sobigdata context without gcube as starting string in the main title for the publication
assertEquals(
1,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx"))
.first()
.getContext()
.size());
assertEquals(
"sobigdata::projects::2",
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx"))
.first()
.getContext()
.get(0)
.getId());
// original result with sobigdata context with gcube as starting string in the subtitle
assertEquals(
1,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
.first()
.getContext()
.size());
assertEquals(
"sobigdata::projects::2",
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
.first()
.getContext()
.get(0)
.getId());
List<StructuredProperty> titles = pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
.first()
.getTitle();
assertEquals(1, titles.size());
assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
// original result with sobigdata context with gcube not as starting string in the main title
assertEquals(
1,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
.first()
.getContext()
.size());
assertEquals(
"sobigdata::projects::1",
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
.first()
.getContext()
.get(0)
.getId());
titles = pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
.first()
.getTitle();
assertEquals(1, titles.size());
assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
assertEquals("main title", titles.get(0).getQualifier().getClassid());
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
// title
assertEquals(
1,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
.first()
.getContext()
.size());
assertEquals(
"dh-ch",
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
.first()
.getContext()
.get(0)
.getId());
titles = pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
.first()
.getTitle();
assertEquals(1, titles.size());
assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
assertEquals("main title", titles.get(0).getQualifier().getClassid());
// original result with multiple main title one of which whith gcube as starting string and with 2 contextes
assertEquals(
1,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
.first()
.getContext()
.size());
assertEquals(
"dh-ch",
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
.first()
.getContext()
.get(0)
.getId());
titles = pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
.first()
.getTitle();
assertEquals(2, titles.size());
assertTrue(
titles
.stream()
.anyMatch(
t -> t.getQualifier().getClassid().equals("main title")
&& t.getValue().toLowerCase().startsWith(prefix)));
// original result without sobigdata in context with gcube as starting string for the main title
assertEquals(
1,
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
.first()
.getContext()
.size());
assertEquals(
"dh-ch",
pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
.first()
.getContext()
.get(0)
.getId());
titles = pubs
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
.first()
.getTitle();
assertEquals(2, titles.size());
assertTrue(
titles
.stream()
.anyMatch(
t -> t.getQualifier().getClassid().equals("main title")
&& t.getValue().toLowerCase().startsWith(prefix)));
}
@Test
void testCleanCfHbSparkJob() throws Exception {
final Dataset<Publication> pubs_in = read(spark, graphInputPath + "/publication", Publication.class);
final Publication p1_in = pubs_in
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'")
.first();
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", p1_in.getCollectedfrom().get(0).getKey());
assertEquals("Bacterial Protein Interaction Database - DUP", p1_in.getCollectedfrom().get(0).getValue());
assertEquals(
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc",
p1_in.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"Bacterial Protein Interaction Database - DUP", p1_in.getInstance().get(0).getCollectedfrom().getValue());
final Publication p2_in = pubs_in
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'")
.first();
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", p2_in.getCollectedfrom().get(0).getKey());
assertEquals("FILUR DATA - DUP", p2_in.getCollectedfrom().get(0).getValue());
assertEquals(
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35",
p2_in.getInstance().get(0).getCollectedfrom().getKey());
assertEquals("FILUR DATA - DUP", p2_in.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", p2_in.getInstance().get(0).getHostedby().getKey());
assertEquals("depositar - DUP", p2_in.getInstance().get(0).getHostedby().getValue());
final Publication p3_in = pubs_in
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'")
.first();
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_in.getCollectedfrom().get(0).getKey());
assertEquals("DANS (Data Archiving and Networked Services)", p3_in.getCollectedfrom().get(0).getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f",
p3_in.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", p3_in.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_in.getInstance().get(0).getHostedby().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", p3_in.getInstance().get(0).getHostedby().getValue());
new CleanGraphSparkJob(
args(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
new String[] {
"--inputPath", graphInputPath + "/publication",
"--outputPath", graphOutputPath + "/publication",
"--isLookupUrl", "lookupurl",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--deepClean", "true",
"--contextId", "sobigdata",
"--verifyParam", "gCube ",
"--masterDuplicatePath", dsMasterDuplicatePath,
"--country", "NL",
"--verifyCountryParam", "10.17632",
"--collectedfrom", "NARCIS",
"--hostedBy", Objects
.requireNonNull(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
.getPath()
})).run(false, isLookUpService);
assertTrue(Files.exists(Paths.get(graphOutputPath, "publication")));
final Dataset<Publication> pubs_out = read(spark, graphOutputPath + "/publication", Publication.class)
.filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_cfhb"));
assertEquals(3, pubs_out.count());
final Publication p1_out = pubs_out
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'")
.first();
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", p1_out.getCollectedfrom().get(0).getKey());
assertEquals("Bacterial Protein Interaction Database", p1_out.getCollectedfrom().get(0).getValue());
assertEquals(
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce",
p1_out.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"Bacterial Protein Interaction Database", p1_out.getInstance().get(0).getCollectedfrom().getValue());
final Publication p2_out = pubs_out
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'")
.first();
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", p2_out.getCollectedfrom().get(0).getKey());
assertEquals("FULIR Data", p2_out.getCollectedfrom().get(0).getValue());
assertEquals(
"10|re3data_____::fc1db64b3964826913b1e9eafe830490",
p2_out.getInstance().get(0).getCollectedfrom().getKey());
assertEquals("FULIR Data", p2_out.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", p2_out.getInstance().get(0).getHostedby().getKey());
assertEquals("depositar", p2_out.getInstance().get(0).getHostedby().getValue());
final Publication p3_out = pubs_out
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'")
.first();
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_out.getCollectedfrom().get(0).getKey());
assertEquals("DANS (Data Archiving and Networked Services)", p3_out.getCollectedfrom().get(0).getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f",
p3_out.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", p3_out.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_out.getInstance().get(0).getHostedby().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", p3_out.getInstance().get(0).getHostedby().getValue());
}
@Test
void testCleanCountry() throws Exception {
new CleanGraphSparkJob(
args(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
new String[] {
"--inputPath", graphInputPath + "/publication",
"--outputPath", graphOutputPath + "/publication",
"--isLookupUrl", "lookupurl",
"--graphTableClassName", Publication.class.getCanonicalName(),
"--deepClean", "true",
"--contextId", "sobigdata",
"--verifyParam", "gCube ",
"--masterDuplicatePath", dsMasterDuplicatePath,
"--country", "NL",
"--verifyCountryParam", "10.17632",
"--collectedfrom", "NARCIS",
"--hostedBy", Objects
.requireNonNull(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
.getPath()
})).run(false, isLookUpService);
final Dataset<Publication> pubs_out = read(spark, graphOutputPath + "/publication", Publication.class)
.filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_country"));
assertEquals(8, pubs_out.count());
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
assertEquals(
1,
pubs_out
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::0224aae28af558f21768dbc6_country"))
.first()
.getCountry()
.size());
// original result with NL country and pid not starting with Mendely prefix
assertEquals(
1,
pubs_out
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1_country"))
.first()
.getCountry()
.size());
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
// inserted with propagation
assertEquals(
1,
pubs_out
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c81248c335f0aa07e06817e_country"))
.first()
.getCountry()
.size());
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
// propagation
assertEquals(
0,
pubs_out
.filter(
(FilterFunction<Publication>) p -> p
.getId()
.equals("50|DansKnawCris::3c81248c335f0aa07e06817d_country"))
.first()
.getCountry()
.size());
}
private List<String> vocs() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")));
}
private List<String> synonyms() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")));
}
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
return spark
.read()
.textFile(path)
.map(as(clazz), Encoders.bean(clazz));
}
private static <R> MapFunction<String, R> as(Class<R> clazz) {
return s -> MAPPER.readValue(s, clazz);
}
private static String classPathResourceAsString(String path) throws IOException {
return IOUtils
.toString(
Objects
.requireNonNull(
CleanGraphSparkJobTest.class.getResourceAsStream(path)));
}
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
parser.parseArgument(args);
return parser;
}
private static void verify_keyword(Publication p_cleaned, String subject) {
Optional<Subject> s1 = p_cleaned
.getSubject()
.stream()
.filter(s -> s.getValue().equals(subject))
.findFirst();
assertTrue(s1.isPresent());
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassid());
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassname());
}
}

View File

@ -13,7 +13,6 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -59,7 +58,7 @@ public class GraphCleaningFunctionsTest {
void testCleanRelations() throws Exception {
List<String> lines = IOUtils
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json"));
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/graph/relation/relation.json"));
for (String json : lines) {
Relation r_in = MAPPER.readValue(json, Relation.class);
assertNotNull(r_in);

View File

@ -1,213 +0,0 @@
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class CleanCfHbSparkJobTest {
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJobTest.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path testBaseTmpPath;
private static String resolvedPath;
private static String graphInputPath;
private static String graphOutputPath;
private static String dsMasterDuplicatePath;
@BeforeAll
public static void beforeAll() throws IOException, URISyntaxException {
testBaseTmpPath = Files.createTempDirectory(CleanCfHbSparkJobTest.class.getSimpleName());
log.info("using test base path {}", testBaseTmpPath);
final File entitiesSources = Paths
.get(CleanCfHbSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities").toURI())
.toFile();
FileUtils
.copyDirectory(
entitiesSources,
testBaseTmpPath.resolve("input").resolve("entities").toFile());
FileUtils
.copyFileToDirectory(
Paths
.get(
CleanCfHbSparkJobTest.class
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
.toURI())
.toFile(),
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
resolvedPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbResolved").toString();
graphOutputPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbPatched").toString();
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
SparkConf conf = new SparkConf();
conf.setAppName(CleanCfHbSparkJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("spark.ui.enabled", "false");
spark = SparkSession
.builder()
.appName(CleanCfHbSparkJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
spark.stop();
}
@Test
void testCleanCfHbSparkJob() throws Exception {
final String outputPath = graphOutputPath + "/dataset";
final String inputPath = graphInputPath + "/dataset";
org.apache.spark.sql.Dataset<Dataset> records = read(spark, inputPath, Dataset.class);
Dataset d = records
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
.first();
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getCollectedfrom().get(0).getKey());
assertEquals("Bacterial Protein Interaction Database - DUP", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"Bacterial Protein Interaction Database - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
d = records
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
.first();
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getCollectedfrom().get(0).getKey());
assertEquals("FILUR DATA - DUP", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals("FILUR DATA - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", d.getInstance().get(0).getHostedby().getKey());
assertEquals("depositar - DUP", d.getInstance().get(0).getHostedby().getValue());
d = records
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
.first();
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
CleanCfHbSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", inputPath,
"--outputPath", outputPath,
"--resolvedPath", resolvedPath + "/dataset",
"--graphTableClassName", Dataset.class.getCanonicalName(),
"--masterDuplicatePath", dsMasterDuplicatePath
});
assertTrue(Files.exists(Paths.get(graphOutputPath, "dataset")));
records = read(spark, outputPath, Dataset.class);
assertEquals(3, records.count());
d = records
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
.first();
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getCollectedfrom().get(0).getKey());
assertEquals("Bacterial Protein Interaction Database", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals("Bacterial Protein Interaction Database", d.getInstance().get(0).getCollectedfrom().getValue());
d = records
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
.first();
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getCollectedfrom().get(0).getKey());
assertEquals("FULIR Data", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals("FULIR Data", d.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", d.getInstance().get(0).getHostedby().getKey());
assertEquals("depositar", d.getInstance().get(0).getHostedby().getValue());
d = records
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
.first();
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
d = records
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
.first();
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
assertEquals(
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
assertEquals(
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
}
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
return spark
.read()
.textFile(path)
.map(as(clazz), Encoders.bean(clazz));
}
private static <R> MapFunction<String, R> as(Class<R> clazz) {
return s -> OBJECT_MAPPER.readValue(s, clazz);
}
}

View File

@ -0,0 +1,11 @@
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<root level="info">
<appender-ref ref="STDOUT" />
</root>
</configuration>