merged stable_ids into openorgswf

This commit is contained in:
miconis 2021-03-25 10:44:49 +01:00
commit 28c1cdd132
284 changed files with 16485 additions and 1635 deletions

View File

@ -29,6 +29,12 @@
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_2.11</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-cli</groupId> <groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId> <artifactId>commons-cli</artifactId>
@ -98,11 +104,6 @@
<artifactId>dnet-pace-core</artifactId> <artifactId>dnet-pace-core</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -32,27 +33,33 @@ public class AuthorMerger {
} }
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) { public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a); int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b); int pb = countAuthorsPids(b);
List<Author> base, enrich; List<Author> base, enrich;
int sa = authorsSize(a); int sa = authorsSize(a);
int sb = authorsSize(b); int sb = authorsSize(b);
if (pa == pb) { if (sa == sb) {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
} else {
base = pa > pb ? a : b; base = pa > pb ? a : b;
enrich = pa > pb ? b : a; enrich = pa > pb ? b : a;
} else {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
} }
enrichPidFromList(base, enrich); enrichPidFromList(base, enrich, threshold);
return base; return base;
} }
private static void enrichPidFromList(List<Author> base, List<Author> enrich) { public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
return mergeAuthor(a, b, THRESHOLD);
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
if (base == null || enrich == null) if (base == null || enrich == null)
return; return;
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base final Map<String, Author> basePidAuthorMap = base
.stream() .stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0) .filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -63,6 +70,7 @@ public class AuthorMerger {
.map(p -> new Tuple2<>(pidToComparableString(p), a))) .map(p -> new Tuple2<>(pidToComparableString(p), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
// <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream() .stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0) .filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -83,10 +91,10 @@ public class AuthorMerger {
.max(Comparator.comparing(Tuple2::_1)); .max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent()) { if (simAuthor.isPresent()) {
double th = THRESHOLD; double th = threshold;
// increase the threshold if the surname is too short // increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null if (simAuthor.get()._2().getSurname() != null
&& simAuthor.get()._2().getSurname().length() <= 3) && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
th = 0.99; th = 0.99;
if (simAuthor.get()._1() > th) { if (simAuthor.get()._1() > th) {
@ -156,7 +164,7 @@ public class AuthorMerger {
} }
private static String normalize(final String s) { private static String normalize(final String s) {
return nfd(s) String[] normalized = nfd(s)
.toLowerCase() .toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError // do not compact the regexes in a single expression, would cause StackOverflowError
// in case // in case
@ -166,7 +174,12 @@ public class AuthorMerger {
.replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ") .replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ") .replaceAll("(\\n)+", " ")
.trim(); .trim()
.split(" ");
Arrays.sort(normalized);
return String.join(" ", normalized);
} }
private static String nfd(final String s) { private static String nfd(final String s) {

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.schema.oaf;
import java.util.*; import java.util.*;
import java.util.function.Function; import java.util.function.Function;
@ -8,14 +8,16 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists; import com.clearspring.analytics.util.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.PidBlacklistProvider;
public class CleaningFunctions { public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10.)";
public static final String DOI_PREFIX = "10.";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
@ -78,7 +80,7 @@ public class CleaningFunctions {
return value; return value;
} }
public static <T extends Oaf> T fixDefaults(T value) { public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -86,7 +88,7 @@ public class CleaningFunctions {
} else if (value instanceof Organization) { } else if (value instanceof Organization) {
Organization o = (Organization) value; Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE)); o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
} }
} else if (value instanceof Relation) { } else if (value instanceof Relation) {
// nothing to clean here // nothing to clean here
@ -137,28 +139,32 @@ public class CleaningFunctions {
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPid())) { if (Objects.nonNull(r.getPid())) {
r r.setPid(processPidCleaning(r.getPid()));
.setPid(
r
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList()));
} }
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r r
.setResourcetype( .setResourcetype(
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
} }
if (Objects.nonNull(r.getInstance())) { if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) { for (Instance i : r.getInstance()) {
final Set<StructuredProperty> pids = Sets.newHashSet(i.getPid());
i
.setAlternateIdentifier(
Optional
.ofNullable(i.getAlternateIdentifier())
.map(
altId -> altId
.stream()
.filter(p -> !pids.contains(p))
.collect(Collectors.toList()))
.orElse(Lists.newArrayList()));
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); i
.setAccessright(
accessRight(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
} }
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
@ -169,7 +175,7 @@ public class CleaningFunctions {
} }
} }
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance()); Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) { if (Objects.isNull(bestaccessrights)) {
r r
.setBestaccessright( .setBestaccessright(
@ -189,7 +195,6 @@ public class CleaningFunctions {
author.setRank(i++); author.setRank(i++);
} }
} }
for (Author a : r.getAuthor()) { for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) { if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList()); a.setPid(Lists.newArrayList());
@ -202,29 +207,14 @@ public class CleaningFunctions {
.filter(p -> Objects.nonNull(p.getQualifier())) .filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue())) .filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> { .map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
p.getQualifier().setClassid(ModelConstants.ORCID);
} else {
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
}
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
return p; return p;
}) })
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect( .collect(
Collectors Collectors
.toMap( .toMap(
p -> p.getQualifier().getClassid() + p.getValue(), StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
Function.identity(),
(p1, p2) -> p1,
LinkedHashMap::new)) LinkedHashMap::new))
.values() .values()
.stream() .stream()
@ -247,6 +237,19 @@ public class CleaningFunctions {
return value; return value;
} }
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
return pids
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList());
}
protected static StructuredProperty cleanValue(StructuredProperty s) { protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s; return s;
@ -266,12 +269,39 @@ public class CleaningFunctions {
} }
} }
private static AccessRight accessRight(String classid, String classname, String scheme) {
return OafMapperUtils
.accessRight(
classid, classname, scheme, scheme);
}
private static Qualifier qualifier(String classid, String classname, String scheme) { private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils return OafMapperUtils
.qualifier( .qualifier(
classid, classname, scheme, scheme); classid, classname, scheme, scheme);
} }
/**
* Utility method that filter PID values on a per-type basis.
* @param s the PID whose value will be checked.
* @return false if the pid matches the filter criteria, true otherwise.
*/
public static boolean pidFilter(StructuredProperty s) {
final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(pidValue) ||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false;
}
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
return false;
}
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
return false;
}
return true;
}
/** /**
* Utility method that normalises PID values on a per-type basis. * Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised. * @param pid the PID whose value will be normalised.
@ -286,7 +316,7 @@ public class CleaningFunctions {
// TODO add cleaning for more PID types as needed // TODO add cleaning for more PID types as needed
case "doi": case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, "")); pid.setValue(value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX));
break; break;
} }
return pid; return pid;

View File

@ -0,0 +1,22 @@
package eu.dnetlib.dhp.schema.oaf;
public class ModelHardLimits {
public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire";
public static final String SEPARATOR = "-";
public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10;
public static String getCollectionName(String format) {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
}
}

View File

@ -1,11 +1,9 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import java.util.ArrayList; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.Arrays;
import java.util.List; import java.util.*;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
@ -13,40 +11,45 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils { public class OafMapperUtils {
public static Oaf merge(final Oaf o1, final Oaf o2) { public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) { if (ModelSupport.isSubClass(left, OafEntity.class)) {
if (ModelSupport.isSubClass(o1, Result.class)) { return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) {
return mergeResults((Result) o1, (Result) o2); ((Relation) left).mergeFrom((Relation) right);
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
((Organization) o1).mergeFrom((Organization) o2);
} else if (ModelSupport.isSubClass(o1, Project.class)) {
((Project) o1).mergeFrom((Project) o2);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
((Relation) o1).mergeFrom((Relation) o2);
} else { } else {
throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
} }
return o1; return left;
} }
public static Result mergeResults(Result r1, Result r2) { public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
if (new ResultTypeComparator().compare(r1, r2) < 0) { if (ModelSupport.isSubClass(left, Result.class)) {
r1.mergeFrom(r2); return mergeResults((Result) left, (Result) right);
return r1; } else if (ModelSupport.isSubClass(left, Datasource.class)) {
((Datasource) left).mergeFrom((Datasource) right);
} else if (ModelSupport.isSubClass(left, Organization.class)) {
((Organization) left).mergeFrom((Organization) right);
} else if (ModelSupport.isSubClass(left, Project.class)) {
((Project) left).mergeFrom((Project) right);
} else { } else {
r2.mergeFrom(r1); throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
return r2; }
return left;
}
public static Result mergeResults(Result left, Result right) {
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
} else {
right.mergeFrom(left);
return right;
} }
} }
@ -102,6 +105,29 @@ public class OafMapperUtils {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename); return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
} }
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier( public static Qualifier qualifier(
final String classid, final String classid,
final String classname, final String classname,
@ -115,6 +141,15 @@ public class OafMapperUtils {
return q; return q;
} }
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static StructuredProperty structuredProperty( public static StructuredProperty structuredProperty(
final String value, final String value,
final String classid, final String classid,
@ -294,4 +329,36 @@ public class OafMapperUtils {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>(); final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
} }
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new AccessRightComparator<>());
final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
} }

View File

@ -1,7 +1,15 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -17,6 +25,16 @@ public class ResultTypeComparator implements Comparator<Result> {
if (right == null) if (right == null)
return -1; return -1;
HashSet<String> lCf = getCollectedFromIds(left);
HashSet<String> rCf = getCollectedFromIds(right);
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
return -1;
}
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
return 1;
}
String lClass = left.getResulttype().getClassid(); String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid(); String rClass = right.getResulttype().getClassid();
@ -46,4 +64,15 @@ public class ResultTypeComparator implements Comparator<Result> {
// Else (but unlikely), lexicographical ordering will do. // Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass); return lClass.compareTo(rClass);
} }
protected HashSet<String> getCollectedFromIds(Result left) {
return Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(c -> c.getKey())
.collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<>());
}
} }

View File

@ -0,0 +1,216 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static com.google.common.base.Preconditions.checkArgument;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* Factory class for OpenAIRE identifiers in the Graph
*/
public class IdentifierFactory implements Serializable {
public static final String ID_SEPARATOR = "::";
public static final String ID_PREFIX_SEPARATOR = "|";
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
"(^10\\.1002\\/[^\\s]+$)|" +
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12;
/**
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE
*/
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();
static {
PID_AUTHORITY.put(PidType.doi, HashBiMap.create());
PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref");
PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite");
PID_AUTHORITY.put(PidType.pmc, HashBiMap.create());
PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central");
PID_AUTHORITY.put(PidType.pmid, HashBiMap.create());
PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central");
PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create());
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
}
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
return pidFromInstance(pid, collectedFrom).distinct().collect(Collectors.toList());
}
public static <T extends Result> String createDOIBoostIdentifier(T entity) {
if (entity == null)
return null;
StructuredProperty pid = null;
if (entity.getPid() != null) {
pid = entity
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(s -> s.getQualifier() != null && "doi".equalsIgnoreCase(s.getQualifier().getClassid()))
.filter(CleaningFunctions::pidFilter)
.findAny()
.orElse(null);
} else {
if (entity.getInstance() != null) {
pid = entity
.getInstance()
.stream()
.filter(i -> i.getPid() != null)
.flatMap(i -> i.getPid().stream())
.filter(CleaningFunctions::pidFilter)
.findAny()
.orElse(null);
}
}
if (pid != null)
return idFromPid(entity, pid, true);
return null;
}
/**
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
* entity T. Returns entity.id when none of the PIDs meet the selection criteria is available.
*
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @param md5 indicates whether should hash the PID value or not.
* @return an identifier from the most relevant PID, entity.id otherwise
*/
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
final Map<String, List<StructuredProperty>> pids = extractPids(entity);
return pids
.values()
.stream()
.flatMap(s -> s.stream())
.min(new PidComparator<>(entity))
.map(
min -> Optional
.ofNullable(pids.get(min.getQualifier().getClassid()))
.map(
p -> p
.stream()
.sorted(new PidValueComparator())
.findFirst()
.map(s -> idFromPid(entity, s, md5))
.orElseGet(entity::getId))
.orElseGet(entity::getId))
.orElseGet(entity::getId);
}
private static <T extends OafEntity> Map<String, List<StructuredProperty>> extractPids(T entity) {
if (entity instanceof Result) {
return Optional
.ofNullable(((Result) entity).getInstance())
.map(
instance -> mapPids(instance))
.orElse(new HashMap<>());
} else {
return entity
.getPid()
.stream()
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(
Collectors
.groupingBy(
p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toList())));
}
}
private static Map<String, List<StructuredProperty>> mapPids(List<Instance> instance) {
return instance
.stream()
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom()))
.flatMap(Function.identity())
.collect(
Collectors
.groupingBy(
p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toList())));
}
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom) {
return Optional
.ofNullable(pid)
.map(
pp -> pp
.stream()
// filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type
.filter(p -> {
return shouldFilterPid(collectedFrom, p);
})
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter))
.orElse(Stream.empty());
}
private static boolean shouldFilterPid(KeyValue collectedFrom, StructuredProperty p) {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
return pType.equals(PidType.handle) || Optional.ofNullable(collectedFrom).isPresent() &&
Optional
.ofNullable(PID_AUTHORITY.get(pType))
.map(authorities -> {
return authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue());
})
.orElse(false);
}
/**
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
*/
public static <T extends OafEntity> String createIdentifier(T entity) {
return createIdentifier(entity, true);
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
return new StringBuilder()
.append(ModelSupport.getIdPrefix(entity.getClass()))
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR)
.append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
.toString();
}
// create the prefix (length = 12)
private static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
while (prefix.length() < ID_PREFIX_LEN) {
prefix.append("_");
}
return prefix.substring(0, ID_PREFIX_LEN);
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
if (lClass.equals(PidType.GRID))
return -1;
if (rClass.equals(PidType.GRID))
return 1;
if (lClass.equals(PidType.mag_id))
return -1;
if (rClass.equals(PidType.mag_id))
return 1;
if (lClass.equals(PidType.urn))
return -1;
if (rClass.equals(PidType.urn))
return 1;
return 0;
}
}

View File

@ -0,0 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.HashMap;
import java.util.HashSet;
public class PidBlacklist extends HashMap<String, HashSet<String>> {
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.IOException;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PidBlacklistProvider {
private static final PidBlacklist blacklist;
static {
try {
String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"));
blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public static PidBlacklist getBlacklist() {
return blacklist;
}
public static Set<String> getBlacklist(String pidType) {
return Optional
.ofNullable(getBlacklist().get(pidType))
.orElse(new HashSet<>());
}
}

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
private T entity;
public PidComparator(T entity) {
this.entity = entity;
}
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
if (ModelSupport.isSubClass(entity, Result.class)) {
return compareResultPids(left, right);
}
if (ModelSupport.isSubClass(entity, Organization.class)) {
return compareOrganizationtPids(left, right);
}
// Else (but unlikely), lexicographical ordering will do.
return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
}
private int compareResultPids(StructuredProperty left, StructuredProperty right) {
return new ResultPidComparator().compare(left, right);
}
private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
return new OrganizationPidComparator().compare(left, right);
}
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import org.apache.commons.lang3.EnumUtils;
public enum PidType {
// Result
doi, pmid, pmc, handle, arXiv, nct, pdb,
// Organization
GRID, mag_id, urn,
// Used by dedup
undefined, original;
public static boolean isValid(String type) {
return EnumUtils.isValidEnum(PidType.class, type);
}
public static PidType tryValueOf(String s) {
try {
return PidType.valueOf(s);
} catch (Exception e) {
return PidType.original;
}
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.*;
public class PidValueComparator implements Comparator<StructuredProperty> {
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
return Optional
.ofNullable(l.getValue())
.map(
lv -> Optional
.ofNullable(r.getValue())
.map(rv -> lv.compareTo(rv))
.orElse(-1))
.orElse(1);
}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ResultPidComparator implements Comparator<StructuredProperty> {
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
if (lClass.equals(PidType.doi))
return -1;
if (rClass.equals(PidType.doi))
return 1;
if (lClass.equals(PidType.pmid))
return -1;
if (rClass.equals(PidType.pmid))
return 1;
if (lClass.equals(PidType.pmc))
return -1;
if (rClass.equals(PidType.pmc))
return 1;
if (lClass.equals(PidType.handle))
return -1;
if (rClass.equals(PidType.handle))
return 1;
if (lClass.equals(PidType.arXiv))
return -1;
if (rClass.equals(PidType.arXiv))
return 1;
if (lClass.equals(PidType.nct))
return -1;
if (rClass.equals(PidType.nct))
return 1;
if (lClass.equals(PidType.pdb))
return -1;
if (rClass.equals(PidType.pdb))
return 1;
return 0;
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.oa.merge;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
public class AuthorMergerTest {
private String publicationsBasePath;
private List<List<Author>> authors;
@BeforeEach
public void setUp() throws Exception {
publicationsBasePath = Paths
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
.toFile()
.getAbsolutePath();
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
.stream()
.map(p -> p._2().getAuthor())
.collect(Collectors.toList());
}
@Test
public void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1));
for (Author author : authors1) {
System.out.println(authorToString(author));
}
}
List<Author> merge = AuthorMerger.merge(authors);
System.out.println("Merge ");
for (Author author : merge) {
System.out.println(authorToString(author));
}
Assertions.assertEquals(7, merge.size());
}
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
List<Tuple2<String, T>> res = new ArrayList<>();
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(path));
String line = reader.readLine();
while (line != null) {
res
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
// read next line
line = reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
public String authorToString(Author a) {
String print = "Fullname = ";
print += a.getFullname() + " pid = [";
if (a.getPid() != null)
for (StructuredProperty sp : a.getPid()) {
print += sp.toComparableString() + " ";
}
print += "]";
return print;
}
}

View File

@ -0,0 +1,69 @@
package eu.dnetlib.dhp.schema.oaf;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.jetbrains.annotations.NotNull;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import it.unimi.dsi.fastutil.Hash;
public class OafMapperUtilsTest {
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(p1.getCollectedfrom().size(), 1);
assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
assertEquals(d2.getCollectedfrom().size(), 1);
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertTrue(
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid()
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
assertEquals(p2.getCollectedfrom().size(), 1);
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(d1.getCollectedfrom().size(), 1);
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertTrue(
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid()
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
}
@NotNull
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
}
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}
}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Set;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
public class BlackListProviderTest {
@Test
public void blackListTest() {
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
final Set<String> xxx = PidBlacklistProvider.getBlacklist("xxx");
Assertions.assertNotNull(xxx);
Assertions.assertEquals(0, xxx.size());
}
}

View File

@ -0,0 +1,72 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.utils.DHPUtils;
public class IdentifierFactoryTest {
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier(
"publication_doi1.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier(
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier(
"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
verifyIdentifier(
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
verifyIdentifier(
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, true);
verifyIdentifier("publication_4.json", defaultID, true);
verifyIdentifier("publication_5.json", defaultID, true);
}
@Test
public void testCreateIdentifierForPublicationNoHash() throws IOException {
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
verifyIdentifier("publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
verifyIdentifier(
"publication_urn1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, false);
verifyIdentifier("publication_4.json", defaultID, false);
verifyIdentifier("publication_5.json", defaultID, false);
}
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
String id = IdentifierFactory.createIdentifier(pub, md5);
assertNotNull(id);
assertEquals(expectedID, id);
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}

View File

@ -0,0 +1,33 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
"value": "Crossref"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
}
]
},
{
"pid": [
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
]
}
]
}

View File

@ -0,0 +1,37 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
"value": "Crossref"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
}
]
},
{
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central"
},
"pid": [
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
]
}
]
}

View File

@ -0,0 +1,37 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|openaire____::1234",
"value": "Zenodo"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
}
]
},
{
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central"
},
"pid": [
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
]
}
]
}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}

View File

@ -0,0 +1,21 @@
{
"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
},
{
"qualifier":{"classid":"pmc"},
"value":"21459329"
}
]
}
]
}

View File

@ -0,0 +1,23 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"pid": [
{
"qualifier": {
"classid": "urn"
},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {
"classid": "scp-number"
},
"value": "79953761260"
},
{
"qualifier": {
"classid": "pmcid"
},
"value": "21459329"
}
]
}

View File

@ -3,12 +3,13 @@ package eu.dnetlib.dhp.schema.common;
import java.util.Comparator; import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class LicenseComparator implements Comparator<Qualifier> { public class AccessRightComparator<T extends Qualifier> implements Comparator<T> {
@Override @Override
public int compare(Qualifier left, Qualifier right) { public int compare(T left, T right) {
if (left == null && right == null) if (left == null && right == null)
return 0; return 0;

View File

@ -11,13 +11,24 @@ public class ModelConstants {
public static final String ORCID_PENDING = "orcid_pending"; public static final String ORCID_PENDING = "orcid_pending";
public static final String ORCID_CLASSNAME = "Open Researcher and Contributor ID"; public static final String ORCID_CLASSNAME = "Open Researcher and Contributor ID";
public static final String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static final String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static final String EUROPE_PUBMED_CENTRAL_ID = "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c";
public static final String PUBMED_CENTRAL_ID = "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357";
public static final String ARXIV_ID = "10|opendoar____::6f4922f45568161a8cdf4ad2299f6d23";
// VOCABULARY VALUE
public static final String ACCESS_RIGHT_OPEN = "OPEN";
public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies"; public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource"; public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";
public static final String DNET_ACCESS_MODES = "dnet:access_modes"; public static final String DNET_ACCESS_MODES = "dnet:access_modes";
public static final String DNET_LANGUAGES = "dnet:languages"; public static final String DNET_LANGUAGES = "dnet:languages";
public static final String DNET_PID_TYPES = "dnet:pid_types"; public static final String DNET_PID_TYPES = "dnet:pid_types";
public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date"; public static final String DNET_DATACITE_DATE = "dnet:dataCite_date";
public static final String DNET_DATACITE_TITLE = "dnet:dataCite_title";
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
public static final String DNET_COUNTRY_TYPE = "dnet:countries"; public static final String DNET_COUNTRY_TYPE = "dnet:countries";
@ -105,6 +116,11 @@ public class ModelConstants {
public static final KeyValue UNKNOWN_REPOSITORY = keyValue( public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository"); "10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE);
public static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE);
private static Qualifier qualifier( private static Qualifier qualifier(
final String classid, final String classid,
final String classname, final String classname,

View File

@ -3,6 +3,12 @@ package eu.dnetlib.dhp.schema.common;
import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkArgument;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.time.temporal.TemporalAccessor;
import java.util.Date;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
@ -375,6 +381,8 @@ public class ModelSupport {
Field.class, Field.class,
GeoLocation.class, GeoLocation.class,
Instance.class, Instance.class,
AccessRight.class,
OpenAccessRoute.class,
Journal.class, Journal.class,
KeyValue.class, KeyValue.class,
Oaf.class, Oaf.class,
@ -473,4 +481,23 @@ public class ModelSupport {
private static <T extends Oaf> String idFnForOafEntity(T t) { private static <T extends Oaf> String idFnForOafEntity(T t) {
return ((OafEntity) t).getId(); return ((OafEntity) t).getId();
} }
public static String oldest(String dateA, String dateB) throws ParseException {
if (StringUtils.isBlank(dateA)) {
return dateB;
}
if (StringUtils.isBlank(dateB)) {
return dateA;
}
if (StringUtils.isNotBlank(dateA) && StringUtils.isNotBlank(dateB)) {
final Date a = Date.from(Instant.from(DateTimeFormatter.ISO_INSTANT.parse(dateA)));
final Date b = Date.from(Instant.from(DateTimeFormatter.ISO_INSTANT.parse(dateB)));
return a.before(b) ? dateA : dateB;
} else {
return null;
}
}
} }

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.Optional;
/**
* This class models the access rights of research products.
*/
public class AccessRight extends Qualifier {
private OpenAccessRoute openAccessRoute;
public OpenAccessRoute getOpenAccessRoute() {
return openAccessRoute;
}
public void setOpenAccessRoute(OpenAccessRoute openAccessRoute) {
this.openAccessRoute = openAccessRoute;
}
public String toComparableString() {
String s = super.toComparableString();
return Optional
.ofNullable(getOpenAccessRoute())
.map(x -> s + "::" + x.toString())
.orElse(s);
}
@Override
public int hashCode() {
return toComparableString().hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Qualifier other = (Qualifier) obj;
return toComparableString().equals(other.toComparableString());
}
}

View File

@ -12,6 +12,7 @@ public class Author implements Serializable {
private String surname; private String surname;
// START WITH 1
private Integer rank; private Integer rank;
private List<StructuredProperty> pid; private List<StructuredProperty> pid;

View File

@ -8,7 +8,7 @@ public class Instance implements Serializable {
private Field<String> license; private Field<String> license;
private Qualifier accessright; private AccessRight accessright;
private Qualifier instancetype; private Qualifier instancetype;
@ -21,6 +21,10 @@ public class Instance implements Serializable {
private KeyValue collectedfrom; private KeyValue collectedfrom;
private List<StructuredProperty> pid;
private List<StructuredProperty> alternateIdentifier;
private Field<String> dateofacceptance; private Field<String> dateofacceptance;
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed // ( article | book ) processing charges. Defined here to cope with possible wrongly typed
@ -41,11 +45,11 @@ public class Instance implements Serializable {
this.license = license; this.license = license;
} }
public Qualifier getAccessright() { public AccessRight getAccessright() {
return accessright; return accessright;
} }
public void setAccessright(Qualifier accessright) { public void setAccessright(AccessRight accessright) {
this.accessright = accessright; this.accessright = accessright;
} }
@ -89,6 +93,14 @@ public class Instance implements Serializable {
this.collectedfrom = collectedfrom; this.collectedfrom = collectedfrom;
} }
public List<StructuredProperty> getPid() {
return pid;
}
public void setPid(List<StructuredProperty> pid) {
this.pid = pid;
}
public Field<String> getDateofacceptance() { public Field<String> getDateofacceptance() {
return dateofacceptance; return dateofacceptance;
} }
@ -97,6 +109,14 @@ public class Instance implements Serializable {
this.dateofacceptance = dateofacceptance; this.dateofacceptance = dateofacceptance;
} }
public List<StructuredProperty> getAlternateIdentifier() {
return alternateIdentifier;
}
public void setAlternateIdentifier(List<StructuredProperty> alternateIdentifier) {
this.alternateIdentifier = alternateIdentifier;
}
public Field<String> getProcessingchargeamount() { public Field<String> getProcessingchargeamount() {
return processingchargeamount; return processingchargeamount;
} }
@ -149,4 +169,5 @@ public class Instance implements Serializable {
return toComparableString().equals(other.toComparableString()); return toComparableString().equals(other.toComparableString());
} }
} }

View File

@ -62,8 +62,6 @@ public abstract class Oaf implements Serializable {
.distinct() // relies on KeyValue.equals .distinct() // relies on KeyValue.equals
.collect(Collectors.toList())); .collect(Collectors.toList()));
mergeOAFDataInfo(o);
setLastupdatetimestamp( setLastupdatetimestamp(
Math Math
.max( .max(

View File

@ -0,0 +1,13 @@
package eu.dnetlib.dhp.schema.oaf;
/**
* This Enum models the OpenAccess status, currently including only the values from Unpaywall
*
* https://support.unpaywall.org/support/solutions/articles/44001777288-what-do-the-types-of-oa-status-green-gold-hybrid-and-bronze-mean-
*/
public enum OpenAccessRoute {
gold, green, hybrid, bronze
}

View File

@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
? p.getFundedamount() ? p.getFundedamount()
: fundedamount; : fundedamount;
// programme = mergeLists(programme, p.getProgramme());
h2020classification = mergeLists(h2020classification, p.getH2020classification()); h2020classification = mergeLists(h2020classification, p.getH2020classification());
mergeOAFDataInfo(e); mergeOAFDataInfo(e);

View File

@ -3,10 +3,13 @@ package eu.dnetlib.dhp.schema.oaf;
import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkArgument;
import java.text.ParseException;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.dhp.schema.common.ModelSupport;
/** /**
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to * Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType, * graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
@ -106,7 +109,7 @@ public class Relation extends Oaf {
} }
public Boolean getValidated() { public Boolean getValidated() {
return validated; return Objects.nonNull(validated) && validated;
} }
public void setValidated(Boolean validated) { public void setValidated(Boolean validated) {
@ -130,6 +133,16 @@ public class Relation extends Oaf {
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
setValidated(getValidated() || r.getValidated());
try {
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
} catch (ParseException e) {
throw new IllegalArgumentException(String
.format(
"invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
getValidationDate()));
}
super.mergeFrom(r); super.mergeFrom(r);
} }

View File

@ -2,12 +2,11 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.LicenseComparator; import eu.dnetlib.dhp.schema.common.AccessRightComparator;
public class Result extends OafEntity implements Serializable { public class Result extends OafEntity implements Serializable {
@ -243,12 +242,12 @@ public class Result extends OafEntity implements Serializable {
Result r = (Result) e; Result r = (Result) e;
// TODO consider merging also Measures measures = mergeLists(measures, r.getMeasures());
instance = mergeLists(instance, r.getInstance()); instance = mergeLists(instance, r.getInstance());
if (r.getBestaccessright() != null if (r.getBestaccessright() != null
&& new LicenseComparator().compare(r.getBestaccessright(), bestaccessright) < 0) && new AccessRightComparator().compare(r.getBestaccessright(), bestaccessright) < 0)
bestaccessright = r.getBestaccessright(); bestaccessright = r.getBestaccessright();
if (r.getResulttype() != null && compareTrust(this, r) < 0) if (r.getResulttype() != null && compareTrust(this, r) < 0)
@ -323,13 +322,13 @@ public class Result extends OafEntity implements Serializable {
if (a.size() == b.size()) { if (a.size() == b.size()) {
int msa = a int msa = a
.stream() .stream()
.filter(i -> i.getValue() != null) .filter(i -> i != null && i.getValue() != null)
.map(i -> i.getValue().length()) .map(i -> i.getValue().length())
.max(Comparator.naturalOrder()) .max(Comparator.naturalOrder())
.orElse(0); .orElse(0);
int msb = b int msb = b
.stream() .stream()
.filter(i -> i.getValue() != null) .filter(i -> i != null && i.getValue() != null)
.map(i -> i.getValue().length()) .map(i -> i.getValue().length())
.max(Comparator.naturalOrder()) .max(Comparator.naturalOrder())
.orElse(0); .orElse(0);

View File

@ -2,6 +2,13 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable; import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
public class StructuredProperty implements Serializable { public class StructuredProperty implements Serializable {
@ -36,7 +43,12 @@ public class StructuredProperty implements Serializable {
} }
public String toComparableString() { public String toComparableString() {
return value != null ? value.toLowerCase() : ""; return Stream
.of(
getQualifier().toComparableString(),
Optional.ofNullable(getValue()).map(String::toLowerCase).orElse(""))
.filter(StringUtils::isNotBlank)
.collect(Collectors.joining("||"));
} }
@Override @Override

View File

@ -1,6 +1,6 @@
package eu.dnetlib.dhp.schema.scholexplorer package eu.dnetlib.dhp.schema.scholexplorer
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Field, KeyValue, Qualifier, StructuredProperty}
object OafUtils { object OafUtils {
@ -39,6 +39,15 @@ object OafUtils {
q q
} }
def createAccessRight(classId: String, className: String, schemeId: String, schemeName: String): AccessRight = {
val accessRight: AccessRight = new AccessRight
accessRight.setClassid(classId)
accessRight.setClassname(className)
accessRight.setSchemeid(schemeId)
accessRight.setSchemename(schemeName)
accessRight
}
def asField[T](value: T): Field[T] = { def asField[T](value: T): Field[T] = {
val tmp = new Field[T] val tmp = new Field[T]

View File

@ -63,6 +63,56 @@ public class MergeTest {
assertEquals(3, a.getSubject().size()); assertEquals(3, a.getSubject().size());
} }
@Test
public void mergeRelationTest() {
Relation a = createRel(null, null);
Relation b = createRel(null, null);
a.mergeFrom(b);
assertEquals(a, b);
a = createRel(true, null);
b = createRel(null, null);
a.mergeFrom(b);
assertEquals(true, a.getValidated());
a = createRel(true, null);
b = createRel(false, null);
a.mergeFrom(b);
assertEquals(true, a.getValidated());
a = createRel(true, null);
b = createRel(true, "2016-04-05T12:41:19.202Z");
a.mergeFrom(b);
assertEquals("2016-04-05T12:41:19.202Z", a.getValidationDate());
a = createRel(true, "2016-05-07T12:41:19.202Z");
b = createRel(true, "2016-04-05T12:41:19.202Z");
a.mergeFrom(b);
assertEquals("2016-04-05T12:41:19.202Z", a.getValidationDate());
}
@Test
public void mergeRelationTestParseException() {
assertThrows(IllegalArgumentException.class, () -> {
Relation a = createRel(true, "2016-04-05");
Relation b = createRel(true, "2016-04-05");
a.mergeFrom(b);
});
}
private Relation createRel(Boolean validated, String validationDate) {
Relation rel = new Relation();
rel.setSource("1");
rel.setTarget("2");
rel.setRelType("reltype");
rel.setSubRelType("subreltype");
rel.setRelClass("relclass");
rel.setValidated(validated);
rel.setValidationDate(validationDate);
return rel;
}
private KeyValue setKV(final String key, final String value) { private KeyValue setKV(final String key, final String value) {
KeyValue k = new KeyValue(); KeyValue k = new KeyValue();

View File

@ -82,7 +82,7 @@ public class ProtoConverter implements Serializable {
private static Instance convertInstance(ResultProtos.Result.Instance ri) { private static Instance convertInstance(ResultProtos.Result.Instance ri) {
final Instance i = new Instance(); final Instance i = new Instance();
i.setAccessright(mapQualifier(ri.getAccessright())); i.setAccessright(mapAccessRight(ri.getAccessright()));
i.setCollectedfrom(mapKV(ri.getCollectedfrom())); i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
i.setDistributionlocation(ri.getDistributionlocation()); i.setDistributionlocation(ri.getDistributionlocation());
@ -510,7 +510,7 @@ public class ProtoConverter implements Serializable {
.map(i -> i.getAccessright()) .map(i -> i.getAccessright())
.min(new LicenseComparator()); .min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier(); final Qualifier rights = min.isPresent() ? mapAccessRight(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) { if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN); rights.setClassid(UNKNOWN);
@ -579,6 +579,15 @@ public class ProtoConverter implements Serializable {
return qualifier; return qualifier;
} }
public static AccessRight mapAccessRight(FieldTypeProtos.Qualifier q) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(q.getClassid());
accessRight.setClassname(q.getClassname());
accessRight.setSchemeid(q.getSchemeid());
accessRight.setSchemename(q.getSchemename());
return accessRight;
}
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
final Country c = new Country(); final Country c = new Country();
c.setClassid(q.getClassid()); c.setClassid(q.getClassid());

View File

@ -68,6 +68,12 @@ public class PromoteActionPayloadForGraphTableJob {
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
logger.info("strategy: {}", strategy); logger.info("strategy: {}", strategy);
Boolean shouldGroupById = Optional
.ofNullable(parser.get("shouldGroupById"))
.map(Boolean::valueOf)
.orElse(true);
logger.info("shouldGroupById: {}", shouldGroupById);
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName); Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName); Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName);
@ -89,7 +95,8 @@ public class PromoteActionPayloadForGraphTableJob {
outputGraphTablePath, outputGraphTablePath,
strategy, strategy,
rowClazz, rowClazz,
actionPayloadClazz); actionPayloadClazz,
shouldGroupById);
}); });
} }
@ -115,12 +122,12 @@ public class PromoteActionPayloadForGraphTableJob {
String outputGraphTablePath, String outputGraphTablePath,
MergeAndGet.Strategy strategy, MergeAndGet.Strategy strategy,
Class<G> rowClazz, Class<G> rowClazz,
Class<A> actionPayloadClazz) { Class<A> actionPayloadClazz, Boolean shouldGroupById) {
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
Dataset<G> result = promoteActionPayloadForGraphTable( Dataset<G> result = promoteActionPayloadForGraphTable(
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz)); .map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
saveGraphTable(result, outputGraphTablePath); saveGraphTable(result, outputGraphTablePath);
@ -174,7 +181,8 @@ public class PromoteActionPayloadForGraphTableJob {
Dataset<A> actionPayloadDS, Dataset<A> actionPayloadDS,
MergeAndGet.Strategy strategy, MergeAndGet.Strategy strategy,
Class<G> rowClazz, Class<G> rowClazz,
Class<A> actionPayloadClazz) { Class<A> actionPayloadClazz,
Boolean shouldGroupById) {
logger logger
.info( .info(
"Promoting action payload for graph table: payload={}, table={}", "Promoting action payload for graph table: payload={}, table={}",
@ -198,9 +206,13 @@ public class PromoteActionPayloadForGraphTableJob {
rowClazz, rowClazz,
actionPayloadClazz); actionPayloadClazz);
return PromoteActionPayloadFunctions if (shouldGroupById) {
.groupGraphTableByIdAndMerge( return PromoteActionPayloadFunctions
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); .groupGraphTableByIdAndMerge(
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
} else {
return joinedAndMerged;
}
} }
private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) { private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) {

View File

@ -40,5 +40,11 @@
"paramLongName": "mergeAndGetStrategy", "paramLongName": "mergeAndGetStrategy",
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET", "paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
"paramRequired": true "paramRequired": true
},
{
"paramName": "sgid",
"paramLongName": "shouldGroupById",
"paramDescription": "indicates whether the promotion operation should group objects in the graph by id or not",
"paramRequired": false
} }
] ]

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -111,6 +115,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/> <ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -162,6 +167,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -56,6 +56,11 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<value>false</value>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -110,6 +114,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/> <ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -161,6 +166,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -111,6 +115,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/> <ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -162,6 +167,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -110,6 +114,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/> <ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -161,6 +166,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -101,7 +101,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath", "-outputGraphTablePath",
"", "",
"-mergeAndGetStrategy", "-mergeAndGetStrategy",
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name(),
"--shouldGroupById",
"true"
})); }));
// then // then
@ -141,7 +143,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath", "-outputGraphTablePath",
outputGraphTableDir.toString(), outputGraphTableDir.toString(),
"-mergeAndGetStrategy", "-mergeAndGetStrategy",
strategy.name() strategy.name(),
"--shouldGroupById",
"true"
}); });
// then // then

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* Class that maps the model of the bipFinder! input data.
* Only needed for deserialization purposes
*/
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
public BipDeserialize() {
super();
}
public List<Score> get(String key) {
if (super.get(key) == null) {
return new ArrayList<>();
}
return super.get(key);
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.List;
/**
* Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
*/
public class BipScore implements Serializable {
private String id; // doi
private List<Score> scoreList; // unit as given in the inputfile
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<Score> getScoreList() {
return scoreList;
}
public void setScoreList(List<Score> scoreList) {
this.scoreList = scoreList;
}
}

View File

@ -0,0 +1,85 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Just collects all the atomic actions produced for the different results and saves them in
* outputpath for the ActionSet
*/
public class CollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
collectAndSave(spark, inputPath, outputPath);
});
}
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
public class KeyValue implements Serializable {
private String key;
private String value;
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
/**
* Subset of the information of the generic results that are needed to create the atomic action
*/
public class PreparedResult implements Serializable {
private String id; // openaire id
private String value; // doi
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.List;
/**
* represents the score in the input file
*/
public class Score implements Serializable {
private String id;
private List<KeyValue> unit;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<KeyValue> getUnit() {
return unit;
}
public void setUnit(List<KeyValue> unit) {
this.unit = unit;
}
}

View File

@ -0,0 +1,200 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import scala.Tuple2;
/**
* created the Atomic Action for each tipe of results
*/
public class SparkAtomicActionScoreJob implements Serializable {
private static String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionScoreJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final String bipScorePath = parser.get("bipScorePath");
log.info("bipScorePath: {}", bipScorePath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
});
}
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
String bipScorePath, Class<I> inputClazz) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(bipScorePath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
Dataset<BipScore> bipScores = spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
System.out.println(bipScores.count());
Dataset<I> results = readPath(spark, inputPath, inputClazz);
results.createOrReplaceTempView("result");
Dataset<PreparedResult> preparedResult = spark
.sql(
"select pIde.value value, id " +
"from result " +
"lateral view explode (pid) p as pIde " +
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
.as(Encoders.bean(PreparedResult.class));
bipScores
.joinWith(
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
"inner")
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
BipScore ret = value._1();
ret.setId(value._2().getId());
return ret;
}, Encoders.bean(BipScore.class))
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
Result ret = new Result();
ret.setDataInfo(getDataInfo());
BipScore first = it.next();
ret.setId(first.getId());
ret.setMeasures(getMeasure(first));
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
return ret;
}, Encoders.bean(Result.class))
.toJavaRDD()
.map(p -> new AtomicAction(inputClazz, p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv.setDataInfo(getDataInfo());
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
private static DataInfo getDataInfo() {
DataInfo di = new DataInfo();
di.setInferred(false);
di.setInvisible(false);
di.setDeletedbyinference(false);
di.setTrust("");
Qualifier qualifier = new Qualifier();
qualifier.setClassid("sysimport:actionset");
qualifier.setClassname("Harvested");
qualifier.setSchemename("dnet:provenanceActions");
qualifier.setSchemeid("dnet:provenanceActions");
di.setProvenanceaction(qualifier);
return di;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "rtn",
"paramLongName": "resultTableName",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "bsp",
"paramLongName": "bipScorePath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,171 @@
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>inputPath</name>
<description>the input path of the resources to be extended</description>
</property>
<property>
<name>bipScorePath</name>
<description>the path where to find the bipFinder scores</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="atomicactions"/>
<error to="Kill"/>
</action>
<fork name="atomicactions">
<path start="atomicactions_publication"/>
<path start="atomicactions_dataset"/>
<path start="atomicactions_orp"/>
<path start="atomicactions_software"/>
</fork>
<action name="atomicactions_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for publications</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for datasets</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for orp</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for software</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<join name="join_aa" to="collectandsave"/>
<action name="collectandsave">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>saves all the aa produced for the several types of results in the as output path</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,323 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class SparkAtomicActionScoreJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(SparkAtomicActionScoreJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(SparkAtomicActionScoreJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(SparkAtomicActionScoreJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(SparkAtomicActionScoreJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void matchOne() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 1);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(2, execVerification.count());
Assertions
.assertEquals(
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
execVerification.select("oaid").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
"1.47565045883e-08",
execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
"0.227515392",
execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
}
@Test
public void matchOneWithTwo() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 1);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(4, execVerification.count());
Assertions
.assertEquals(
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
execVerification.select("oaid").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
2,
execVerification.filter("id = 'influence'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'popularity'").count());
List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
String tmp_influence = tmp_ds.get(0).getString(0);
Assertions
.assertTrue(
"1.47565045883e-08".equals(tmp_influence) ||
"1.98956540239e-08".equals(tmp_influence));
tmp_influence = tmp_ds.get(1).getString(0);
Assertions
.assertTrue(
"1.47565045883e-08".equals(tmp_influence) ||
"1.98956540239e-08".equals(tmp_influence));
Assertions.assertTrue(!tmp_ds.get(0).getString(0).equals(tmp_ds.get(1).getString(0)));
}
@Test
public void matchTwo() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 2);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(4, execVerification.count());
Assertions
.assertEquals(
2,
execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
Assertions
.assertEquals(
2,
execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'influence'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'popularity'").count());
Assertions
.assertEquals(
"1.47565045883e-08",
execVerification
.filter(
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
"and id = 'influence'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"1.98956540239e-08",
execVerification
.filter(
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
"and id = 'influence'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"0.282046161584",
execVerification
.filter(
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
"and id = 'popularity'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"0.227515392",
execVerification
.filter(
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
"and id = 'popularity'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
IOUtils IOUtils
.toString( .toString(
CheckDuplictedIdsJob.class CheckDuplictedIdsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
parser.parseArgument(args); parser.parseArgument(args);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String countPath = parser.get("workingPath") + "/counts"; final String countPath = parser.get("outputDir") + "/counts";
log.info("countPath: {}", countPath); log.info("countPath: {}", countPath);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG())) .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(countPath); .json(countPath);
; ;

View File

@ -44,10 +44,10 @@ public class GenerateEventsJob {
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String eventsPath = workingPath + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist"); final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
@ -59,6 +59,9 @@ public class GenerateEventsJob {
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist"); final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ",")); log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -70,12 +73,12 @@ public class GenerateEventsJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
final Dataset<ResultGroup> groups = ClusterUtils final Dataset<ResultGroup> groups = ClusterUtils
.readPath(spark, workingPath + "/duplicates", ResultGroup.class); .readPath(spark, workingDir + "/duplicates", ResultGroup.class);
final Dataset<Event> dataset = groups final Dataset<Event> dataset = groups
.map( .map(
g -> EventFinder g -> EventFinder
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators), .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
Encoders Encoders
.bean(EventGroup.class)) .bean(EventGroup.class))
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class)); .flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));

View File

@ -46,7 +46,7 @@ public class GenerateStatsJob {
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String dbUrl = parser.get("dbUrl"); final String dbUrl = parser.get("dbUrl");

View File

@ -46,7 +46,7 @@ public class IndexEventSubsetJob {
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String index = parser.get("index"); final String index = parser.get("index");
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
final String indexHost = parser.get("esHost"); final String indexHost = parser.get("esHost");
log.info("indexHost: {}", indexHost); log.info("indexHost: {}", indexHost);
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
final String esNodesWanOnly = parser.get("esNodesWanOnly");
log.info("esNodesWanOnly: {}", esNodesWanOnly);
final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic")); final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
log.info("maxEventsForTopic: {}", maxEventsForTopic); log.info("maxEventsForTopic: {}", maxEventsForTopic);
@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
esCfg.put("es.index.auto.create", "false"); esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost); esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8"); esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", "60s"); esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", "200"); esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", "true"); esCfg.put("es.nodes.wan.only", esNodesWanOnly);
log.info("*** Start indexing"); log.info("*** Start indexing");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);

View File

@ -54,7 +54,7 @@ public class IndexNotificationsJob {
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String index = parser.get("index"); final String index = parser.get("index");
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
final String indexHost = parser.get("esHost"); final String indexHost = parser.get("esHost");
log.info("indexHost: {}", indexHost); log.info("indexHost: {}", indexHost);
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
final String esNodesWanOnly = parser.get("esNodesWanOnly");
log.info("esNodesWanOnly: {}", esNodesWanOnly);
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl"); final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl); log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
@ -92,10 +104,10 @@ public class IndexNotificationsJob {
esCfg.put("es.index.auto.create", "false"); esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost); esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8"); esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", "60s"); esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", "200"); esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", "true"); esCfg.put("es.nodes.wan.only", esNodesWanOnly);
log.info("*** Start indexing"); log.info("*** Start indexing");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);

View File

@ -36,7 +36,7 @@ public class IndexOnESJob {
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String index = parser.get("index"); final String index = parser.get("index");
@ -45,6 +45,18 @@ public class IndexOnESJob {
final String indexHost = parser.get("esHost"); final String indexHost = parser.get("esHost");
log.info("indexHost: {}", indexHost); log.info("indexHost: {}", indexHost);
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
final String esNodesWanOnly = parser.get("esNodesWanOnly");
log.info("esNodesWanOnly: {}", esNodesWanOnly);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
final JavaRDD<String> inputRdd = ClusterUtils final JavaRDD<String> inputRdd = ClusterUtils
@ -53,15 +65,13 @@ public class IndexOnESJob {
.javaRDD(); .javaRDD();
final Map<String, String> esCfg = new HashMap<>(); final Map<String, String> esCfg = new HashMap<>();
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
esCfg.put("es.index.auto.create", "false"); esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost); esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8"); esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", "60s"); esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", "200"); esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", "true"); esCfg.put("es.nodes.wan.only", esNodesWanOnly);
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
} }

View File

@ -42,10 +42,10 @@ public class JoinStep0Job {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step0"; final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath); log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -57,10 +57,10 @@ public class JoinStep0Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); .readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);
final Dataset<RelatedDatasource> typedRels = ClusterUtils final Dataset<RelatedDatasource> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class); .readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator() final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
.toColumn(); .toColumn();

View File

@ -40,10 +40,10 @@ public class JoinStep1Job {
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step1"; final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath); log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep1Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class); .readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);
final Dataset<RelatedProject> typedRels = ClusterUtils final Dataset<RelatedProject> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class); .readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator() final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
.toColumn(); .toColumn();

View File

@ -39,10 +39,10 @@ public class JoinStep2Job {
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step2"; final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath); log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -54,10 +54,10 @@ public class JoinStep2Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class); .readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);
final Dataset<RelatedSoftware> typedRels = ClusterUtils final Dataset<RelatedSoftware> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class); .readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator() final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
.toColumn(); .toColumn();

View File

@ -40,10 +40,10 @@ public class JoinStep3Job {
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step3"; final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath); log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep3Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class); .readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);
final Dataset<RelatedDataset> typedRels = ClusterUtils final Dataset<RelatedDataset> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class); .readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator() final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
.toColumn(); .toColumn();

View File

@ -40,10 +40,10 @@ public class JoinStep4Job {
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step4"; final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath); log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep4Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class); .readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);
final Dataset<RelatedPublication> typedRels = ClusterUtils final Dataset<RelatedPublication> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class); .readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator() final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
.toColumn(); .toColumn();

View File

@ -36,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
public class PartitionEventsByDsIdJob { public class PartitionEventsByDsIdJob {
private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class); private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
private static final String OPENDOAR_NSPREFIX = "10|opendoar____::"; private static final String OPENDOAR_NSPREFIX = "opendoar____::";
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
@ -55,10 +55,10 @@ public class PartitionEventsByDsIdJob {
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events"; final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId"; final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
log.info("partitionPath: {}", partitionPath); log.info("partitionPath: {}", partitionPath);
final String opendoarIds = parser.get("opendoarIds"); final String opendoarIds = parser.get("opendoarIds");
@ -91,6 +91,7 @@ public class PartitionEventsByDsIdJob {
.write() .write()
.partitionBy("group") .partitionBy("group")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(partitionPath); .json(partitionPath);
}); });
@ -122,6 +123,7 @@ public class PartitionEventsByDsIdJob {
final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId(); final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
res.setEventId(e.getEventId());
res.setOriginalId(payload.getResult().getOriginalId()); res.setOriginalId(payload.getResult().getOriginalId());
res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null)); res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
res.setTopic(e.getTopic()); res.setTopic(e.getTopic());

View File

@ -45,10 +45,10 @@ public class PrepareGroupsJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String groupsPath = workingPath + "/duplicates"; final String groupsPath = workingDir + "/duplicates";
log.info("groupsPath: {}", groupsPath); log.info("groupsPath: {}", groupsPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups"); final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
final Dataset<OaBrokerMainEntity> results = ClusterUtils final Dataset<OaBrokerMainEntity> results = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class); .readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);
final Dataset<Relation> mergedRels = ClusterUtils final Dataset<Relation> mergedRels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class) .loadRelations(graphPath, spark)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator() final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()

View File

@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedDatasets"; final String relsPath = workingDir + "/relatedDatasets";
log.info("relsPath: {}", relsPath); log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class)); .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
final Dataset<Relation> rels = ClusterUtils final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class) .loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
final Dataset<RelatedDataset> dataset = rels final Dataset<RelatedDataset> dataset = rels
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner") .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> { .map(t -> {
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2); final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
t._2);
rel.getRelDataset().setRelType(t._1.getRelClass()); rel.getRelDataset().setRelType(t._1.getRelClass());
return rel; return rel;
}, Encoders.bean(RelatedDataset.class)); }, Encoders.bean(RelatedDataset.class));

View File

@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedDatasources"; final String relsPath = workingDir + "/relatedDatasources";
log.info("relsPath: {}", relsPath); log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();

View File

@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedProjects"; final String relsPath = workingDir + "/relatedProjects";
log.info("relsPath: {}", relsPath); log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class)); .map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
final Dataset<Relation> rels = ClusterUtils final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class) .loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))

View File

@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedPublications"; final String relsPath = workingDir + "/relatedPublications";
log.info("relsPath: {}", relsPath); log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
Encoders.bean(OaBrokerRelatedPublication.class)); Encoders.bean(OaBrokerRelatedPublication.class));
final Dataset<Relation> rels = ClusterUtils final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class) .loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
final Dataset<RelatedPublication> dataset = rels final Dataset<RelatedPublication> dataset = rels
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner") .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> { .map(t -> {
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2); final RelatedPublication rel = new RelatedPublication(
t._1.getSource(), t._2);
rel.getRelPublication().setRelType(t._1.getRelClass()); rel.getRelPublication().setRelType(t._1.getRelClass());
return rel; return rel;
}, Encoders.bean(RelatedPublication.class)); }, Encoders.bean(RelatedPublication.class));

View File

@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedSoftwares"; final String relsPath = workingDir + "/relatedSoftwares";
log.info("relsPath: {}", relsPath); log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class)); .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
final Dataset<Relation> rels = ClusterUtils final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class) .loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))

View File

@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
final String graphPath = parser.get("graphPath"); final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath); log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath"); final String workingDir = parser.get("workingDir");
log.info("workingPath: {}", workingPath); log.info("workingDir: {}", workingDir);
final String simpleEntitiesPath = workingPath + "/simpleEntities"; final String simpleEntitiesPath = workingDir + "/simpleEntities";
log.info("simpleEntitiesPath: {}", simpleEntitiesPath); log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();

View File

@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMissingSubject() { public EnrichMissingSubject() {
super(20, super(20,
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()), s -> {
switch (s.getType().toLowerCase()) {
case "acm":
return Topic.ENRICH_MISSING_SUBJECT_ACM;
case "arxiv":
return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
case "ddc":
return Topic.ENRICH_MISSING_SUBJECT_DDC;
case "jel":
return Topic.ENRICH_MISSING_SUBJECT_JEL;
case "mesh":
return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
case "rvk":
return Topic.ENRICH_MISSING_SUBJECT_RVK;
default:
return null;
}
},
(p, s) -> p.getSubjects().add(s), (p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s)); s -> subjectAsString(s));
} }

View File

@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMoreSubject() { public EnrichMoreSubject() {
super(20, super(20,
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()), s -> {
switch (s.getType().toLowerCase()) {
case "acm":
return Topic.ENRICH_MORE_SUBJECT_ACM;
case "arxiv":
return Topic.ENRICH_MORE_SUBJECT_ARXIV;
case "ddc":
return Topic.ENRICH_MORE_SUBJECT_DDC;
case "jel":
return Topic.ENRICH_MORE_SUBJECT_JEL;
case "mesh":
return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
case "rvk":
return Topic.ENRICH_MORE_SUBJECT_RVK;
default:
return null;
}
},
(p, s) -> p.getSubjects().add(s), (p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s)); s -> subjectAsString(s));
} }

View File

@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class ClusterUtils { public class ClusterUtils {
@ -30,6 +31,16 @@ public class ClusterUtils {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
} }
public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
return ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.map(r -> {
r.setSource(ConversionUtils.cleanOpenaireId(r.getSource()));
r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget()));
return r;
}, Encoders.bean(Relation.class));
}
public static <R> Dataset<R> readPath( public static <R> Dataset<R> readPath(
final SparkSession spark, final SparkSession spark,
final String inputPath, final String inputPath,
@ -67,6 +78,7 @@ public class ClusterUtils {
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz)) .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(path); .json(path);
} }

View File

@ -74,7 +74,7 @@ public class ConversionUtils {
} }
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
res.setOpenaireId(d.getId()); res.setOpenaireId(cleanOpenaireId(d.getId()));
res.setOriginalId(first(d.getOriginalId())); res.setOriginalId(first(d.getOriginalId()));
res.setTitle(structPropValue(d.getTitle())); res.setTitle(structPropValue(d.getTitle()));
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -89,7 +89,7 @@ public class ConversionUtils {
} }
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
res.setOpenaireId(p.getId()); res.setOpenaireId(cleanOpenaireId(p.getId()));
res.setOriginalId(first(p.getOriginalId())); res.setOriginalId(first(p.getOriginalId()));
res.setTitle(structPropValue(p.getTitle())); res.setTitle(structPropValue(p.getTitle()));
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -106,7 +106,7 @@ public class ConversionUtils {
final OaBrokerMainEntity res = new OaBrokerMainEntity(); final OaBrokerMainEntity res = new OaBrokerMainEntity();
res.setOpenaireId(result.getId()); res.setOpenaireId(cleanOpenaireId(result.getId()));
res.setOriginalId(first(result.getOriginalId())); res.setOriginalId(first(result.getOriginalId()));
res.setTypology(classId(result.getResulttype())); res.setTypology(classId(result.getResulttype()));
res.setTitles(structPropList(result.getTitle())); res.setTitles(structPropList(result.getTitle()));
@ -129,6 +129,10 @@ public class ConversionUtils {
return res; return res;
} }
public static String cleanOpenaireId(final String id) {
return id.contains("|") ? StringUtils.substringAfter(id, "|") : id;
}
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
if (author == null) { if (author == null) {
return null; return null;
@ -188,7 +192,7 @@ public class ConversionUtils {
} }
final OaBrokerProject res = new OaBrokerProject(); final OaBrokerProject res = new OaBrokerProject();
res.setOpenaireId(p.getId()); res.setOpenaireId(cleanOpenaireId(p.getId()));
res.setTitle(fieldValue(p.getTitle())); res.setTitle(fieldValue(p.getTitle()));
res.setAcronym(fieldValue(p.getAcronym())); res.setAcronym(fieldValue(p.getAcronym()));
res.setCode(fieldValue(p.getCode())); res.setCode(fieldValue(p.getCode()));
@ -214,7 +218,7 @@ public class ConversionUtils {
} }
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
res.setOpenaireId(sw.getId()); res.setOpenaireId(cleanOpenaireId(sw.getId()));
res.setName(structPropValue(sw.getTitle())); res.setName(structPropValue(sw.getTitle()));
res.setDescription(fieldValue(sw.getDescription())); res.setDescription(fieldValue(sw.getDescription()));
res.setRepository(fieldValue(sw.getCodeRepositoryUrl())); res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
@ -230,7 +234,7 @@ public class ConversionUtils {
final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource(); final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname()))); res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
res.setOpenaireId(ds.getId()); res.setOpenaireId(cleanOpenaireId(ds.getId()));
res.setType(classId(ds.getDatasourcetype())); res.setType(classId(ds.getDatasourcetype()));
return res; return res;
} }

View File

@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator(); final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
collectedFromSet collectedFromSet
.stream() .stream()
.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL)) .map(
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
BrokerConstants.COLLECTED_FROM_REL))
.forEach(res::addTuple); .forEach(res::addTuple);
hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
hostedBySet
.stream()
.map(
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
BrokerConstants.HOSTED_BY_REL))
.forEach(res::addTuple);
return res; return res;
} }

View File

@ -76,6 +76,7 @@ public class EventFinder {
final Set<String> dsIdWhitelist, final Set<String> dsIdWhitelist,
final Set<String> dsIdBlacklist, final Set<String> dsIdBlacklist,
final Set<String> dsTypeWhitelist, final Set<String> dsTypeWhitelist,
final Set<String> topicWhitelist,
final Map<String, LongAccumulator> accumulators) { final Map<String, LongAccumulator> accumulators) {
final List<UpdateInfo<?>> list = new ArrayList<>(); final List<UpdateInfo<?>> list = new ArrayList<>();
@ -84,7 +85,13 @@ public class EventFinder {
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) { for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) { if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
for (final UpdateMatcher<?> matcher : matchers) { for (final UpdateMatcher<?> matcher : matchers) {
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)); for (final UpdateInfo<?> info : matcher
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
if (topicWhitelist == null || topicWhitelist.isEmpty()
|| topicWhitelist.contains(info.getTopic().getPath())) {
list.add(info);
}
}
} }
} }
} }

View File

@ -0,0 +1,9 @@
[
{
"paramName": "o",
"paramLongName": "outputDir",
"paramDescription": "the path where the data are stored",
"paramRequired": true
}
]

View File

@ -7,7 +7,7 @@
}, },
{ {
"paramName": "o", "paramName": "o",
"paramLongName": "workingPath", "paramLongName": "workingDir",
"paramDescription": "the path where the temporary data will be stored", "paramDescription": "the path where the temporary data will be stored",
"paramRequired": true "paramRequired": true
} }

View File

@ -6,7 +6,7 @@
<description>the path where the graph is stored</description> <description>the path where the graph is stored</description>
</property> </property>
<property> <property>
<name>workingPath</name> <name>outputDir</name>
<description>the path where the the generated data will be stored</description> <description>the path where the the generated data will be stored</description>
</property> </property>
<property> <property>
@ -24,6 +24,11 @@
<value>-</value> <value>-</value>
<description>a black list (comma separeted, - for empty list) of datasource ids</description> <description>a black list (comma separeted, - for empty list) of datasource ids</description>
</property> </property>
<property>
<name>topicWhitelist</name>
<value>*</value>
<description>a white list (comma separeted, * for all) of topics</description>
</property>
<property> <property>
<name>esEventIndexName</name> <name>esEventIndexName</name>
<description>the elasticsearch index name for events</description> <description>the elasticsearch index name for events</description>
@ -36,6 +41,26 @@
<name>esIndexHost</name> <name>esIndexHost</name>
<description>the elasticsearch host</description> <description>the elasticsearch host</description>
</property> </property>
<property>
<name>esBatchWriteRetryCount</name>
<value>8</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esBatchWriteRetryWait</name>
<value>60s</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esBatchSizeEntries</name>
<value>200</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esNodesWanOnly</name>
<value>true</value>
<description>an ES configuration property</description>
</property>
<property> <property>
<name>maxIndexedEventsForDsAndTopic</name> <name>maxIndexedEventsForDsAndTopic</name>
<description>the max number of events for each couple (ds/topic)</description> <description>the max number of events for each couple (ds/topic)</description>
@ -111,15 +136,25 @@
</configuration> </configuration>
</global> </global>
<start to="ensure_working_path"/> <start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="ensure_output_dir">${wf:conf('resumeFrom') eq 'ensure_output_dir'}</case>
<case to="index_event_subset">${wf:conf('resumeFrom') eq 'index_event_subset'}</case>
<case to="stats">${wf:conf('resumeFrom') eq 'stats'}</case>
<case to="index_notifications">${wf:conf('resumeFrom') eq 'index_notifications'}</case>
<default to="ensure_output_dir"/>
</switch>
</decision>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="ensure_working_path"> <action name="ensure_output_dir">
<fs> <fs>
<mkdir path='${workingPath}'/> <mkdir path='${outputDir}'/>
</fs> </fs>
<ok to="start_entities_and_rels"/> <ok to="start_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -152,7 +187,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="wait_entities_and_rels"/> <ok to="wait_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -176,7 +211,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="wait_entities_and_rels"/> <ok to="wait_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -201,7 +236,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="wait_entities_and_rels"/> <ok to="wait_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -225,7 +260,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="wait_entities_and_rels"/> <ok to="wait_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -249,7 +284,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="wait_entities_and_rels"/> <ok to="wait_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -273,7 +308,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="wait_entities_and_rels"/> <ok to="wait_entities_and_rels"/>
<error to="Kill"/> <error to="Kill"/>
@ -299,7 +334,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="join_entities_step1"/> <ok to="join_entities_step1"/>
<error to="Kill"/> <error to="Kill"/>
@ -323,7 +358,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="join_entities_step2"/> <ok to="join_entities_step2"/>
<error to="Kill"/> <error to="Kill"/>
@ -347,7 +382,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="join_entities_step3"/> <ok to="join_entities_step3"/>
<error to="Kill"/> <error to="Kill"/>
@ -371,7 +406,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="join_entities_step4"/> <ok to="join_entities_step4"/>
<error to="Kill"/> <error to="Kill"/>
@ -395,7 +430,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="prepare_groups"/> <ok to="prepare_groups"/>
<error to="Kill"/> <error to="Kill"/>
@ -419,7 +454,7 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
</spark> </spark>
<ok to="generate_events"/> <ok to="generate_events"/>
<error to="Kill"/> <error to="Kill"/>
@ -442,10 +477,12 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg> <arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg> <arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg> <arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
</spark> </spark>
<ok to="index_event_subset"/> <ok to="index_event_subset"/>
<error to="Kill"/> <error to="Kill"/>
@ -468,38 +505,16 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--index</arg><arg>${esEventIndexName}</arg> <arg>--index</arg><arg>${esEventIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg> <arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg> <arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg> <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark> </spark>
<ok to="index_notifications"/>
<error to="Kill"/>
</action>
<action name="index_notifications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>IndexNotificationsOnESJob</name>
<class>eu.dnetlib.dhp.broker.oa.IndexNotificationsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="stats"/> <ok to="stats"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -521,16 +536,46 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg> <arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
<arg>--dbUser</arg><arg>${brokerDbUser}</arg> <arg>--dbUser</arg><arg>${brokerDbUser}</arg>
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg> <arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg> <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark> </spark>
<ok to="index_notifications"/>
<error to="Kill"/>
</action>
<action name="index_notifications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>IndexNotificationsOnESJob</name>
<class>eu.dnetlib.dhp.broker.oa.IndexNotificationsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,7 +1,13 @@
[ [
{
"paramName": "wp",
"paramLongName": "workingDir",
"paramDescription": "the path where the temporary data are stored",
"paramRequired": true
},
{ {
"paramName": "o", "paramName": "o",
"paramLongName": "workingPath", "paramLongName": "outputDir",
"paramDescription": "the path where the generated events will be stored", "paramDescription": "the path where the generated events will be stored",
"paramRequired": true "paramRequired": true
}, },
@ -22,5 +28,11 @@
"paramLongName": "datasourceIdBlacklist", "paramLongName": "datasourceIdBlacklist",
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids", "paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
"paramRequired": true "paramRequired": true
},
{
"paramName": "topicWhitelist",
"paramLongName": "topicWhitelist",
"paramDescription": "a white list (comma separeted, * for all) of topics",
"paramRequired": true
} }
] ]

Some files were not shown because too many files have changed in this diff Show More