diff --git a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml deleted file mode 100644 index 14959d630e..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml +++ /dev/null @@ -1,82 +0,0 @@ - - - - dhp-workflows - eu.dnetlib.dhp - 1.2.4-SNAPSHOT - - 4.0.0 - - dhp-dedup-scholexplorer - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - - - - - - - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - - eu.dnetlib - dnet-pace-core - - - org.apache.spark - spark-graphx_2.11 - - - com.fasterxml.jackson.core - jackson-databind - - - com.fasterxml.jackson.core - jackson-core - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java deleted file mode 100644 index db55434d87..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ /dev/null @@ -1,121 +0,0 @@ - -package eu.dnetlib.dedup; - -import static java.util.Collections.reverseOrder; -import static java.util.Map.Entry.comparingByValue; -import static java.util.stream.Collectors.toMap; - -import static org.apache.commons.lang.StringUtils.endsWith; -import static org.apache.commons.lang.StringUtils.substringBefore; - -import java.time.Year; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.lang.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.Field; - -public class DatePicker { - - private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; - private static final String DATE_DEFAULT_SUFFIX = "01-01"; - private static final int YEAR_LB = 1300; - private static final int YEAR_UB = Year.now().getValue() + 5; - - public static Field pick(final Collection dateofacceptance) { - - final Map frequencies = dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); - - if (frequencies.isEmpty()) { - return new Field<>(); - } - - final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); - - // let's sort this map by values first, filtering out invalid dates - final Map sorted = frequencies - .entrySet() - .stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); - - // shortcut - if (sorted.size() == 0) { - return date; - } - - // voting method (1/3 + 1) wins - if (sorted.size() >= 3) { - final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = sorted - .entrySet() - .stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); - - // cannot find strong majority - if (accepted.isEmpty()) { - final int max = sorted.values().iterator().next(); - Optional first = sorted - .entrySet() - .stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } - - date.setValue(sorted.keySet().iterator().next()); - return date; - } - - if (accepted.size() == 1) { - date.setValue(accepted.get(0)); - return date; - } else { - final Optional first = accepted - .stream() - .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) - .findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } - - return date; - } - - // 1st non YYYY-01-01 is returned - } else { - if (sorted.size() == 2) { - for (Map.Entry e : sorted.entrySet()) { - if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { - date.setValue(e.getKey()); - return date; - } - } - } - - // none of the dates seems good enough, return the 1st one - date.setValue(sorted.keySet().iterator().next()); - return date; - } - } - - private static boolean inRange(final String date) { - final int year = Integer.parseInt(substringBefore(date, "-")); - return year >= YEAR_LB && year <= YEAR_UB; - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java deleted file mode 100644 index bba277ad67..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ /dev/null @@ -1,327 +0,0 @@ - -package eu.dnetlib.dedup; - -import java.util.Collection; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; - -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; - -public class DedupRecordFactory { - - public static JavaRDD createDedupRecord( - final JavaSparkContext sc, - final SparkSession spark, - final String mergeRelsInputPath, - final String entitiesInputPath, - final OafEntityType entityType, - final DedupConfig dedupConf) { - long ts = System.currentTimeMillis(); - // - final JavaPairRDD inputJsonEntities = spark - .read() - .load(entitiesInputPath) - .as(Encoders.kryo(Oaf.class)) - .map( - (MapFunction) p -> new org.codehaus.jackson.map.ObjectMapper().writeValueAsString(p), - Encoders.STRING()) - .javaRDD() - .mapToPair( - (PairFunction) it -> new Tuple2<>( - MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)); - - // : source is the dedup_id, target is the id of the mergedIn - JavaPairRDD mergeRels = spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .mapToPair( - (PairFunction) r -> new Tuple2(r.getTarget(), r.getSource())); - - // - final JavaPairRDD joinResult = mergeRels - .join(inputJsonEntities) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - - JavaPairRDD> sortedJoinResult = joinResult.groupByKey(); - - switch (entityType) { - case publication: - return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts)); - case dataset: - return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts)); - case project: - return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts)); - case software: - return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts)); - case datasource: - return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts)); - case organization: - return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts)); - case otherresearchproduct: - return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); - default: - return null; - } - } - - private static DLIPublication publicationMerger(Tuple2> e, final long ts) { - - DLIPublication p = new DLIPublication(); // the result of the merge, to be returned at the end - - p.setId(e._1()); - - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - - final Collection dateofacceptance = Lists.newArrayList(); - - if (e._2() != null) - e - ._2() - .forEach( - pub -> { - try { - DLIPublication publication = mapper.readValue(pub, DLIPublication.class); - - p.mergeFrom(publication); - p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); - // add to the list if they are not null - if (publication.getDateofacceptance() != null) - dateofacceptance.add(publication.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - p.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (p.getDataInfo() == null) - p.setDataInfo(new DataInfo()); - p.getDataInfo().setTrust("0.9"); - p.setLastupdatetimestamp(ts); - return p; - } - - private static DLIDataset datasetMerger(Tuple2> e, final long ts) { - - DLIDataset d = new DLIDataset(); // the result of the merge, to be returned at the end - - d.setId(e._1()); - - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - - final Collection dateofacceptance = Lists.newArrayList(); - - if (e._2() != null) - e - ._2() - .forEach( - dat -> { - try { - Dataset dataset = mapper.readValue(dat, Dataset.class); - - d.mergeFrom(dataset); - d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor())); - // add to the list if they are not null - if (dataset.getDateofacceptance() != null) - dateofacceptance.add(dataset.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - d.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (d.getDataInfo() == null) - d.setDataInfo(new DataInfo()); - d.getDataInfo().setTrust("0.9"); - d.setLastupdatetimestamp(ts); - return d; - } - - private static Project projectMerger(Tuple2> e, final long ts) { - - Project p = new Project(); // the result of the merge, to be returned at the end - - p.setId(e._1()); - - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - if (e._2() != null) - e - ._2() - .forEach( - proj -> { - try { - Project project = mapper.readValue(proj, Project.class); - - p.mergeFrom(project); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (p.getDataInfo() == null) - p.setDataInfo(new DataInfo()); - p.getDataInfo().setTrust("0.9"); - p.setLastupdatetimestamp(ts); - return p; - } - - private static Software softwareMerger(Tuple2> e, final long ts) { - - Software s = new Software(); // the result of the merge, to be returned at the end - - s.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e - ._2() - .forEach( - soft -> { - try { - Software software = mapper.readValue(soft, Software.class); - - s.mergeFrom(software); - s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor())); - // add to the list if they are not null - if (software.getDateofacceptance() != null) - dateofacceptance.add(software.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - s.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (s.getDataInfo() == null) - s.setDataInfo(new DataInfo()); - s.getDataInfo().setTrust("0.9"); - s.setLastupdatetimestamp(ts); - return s; - } - - private static Datasource datasourceMerger(Tuple2> e, final long ts) { - Datasource d = new Datasource(); // the result of the merge, to be returned at the end - d.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - if (e._2() != null) - e - ._2() - .forEach( - dat -> { - try { - Datasource datasource = mapper.readValue(dat, Datasource.class); - - d.mergeFrom(datasource); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (d.getDataInfo() == null) - d.setDataInfo(new DataInfo()); - d.getDataInfo().setTrust("0.9"); - d.setLastupdatetimestamp(ts); - return d; - } - - private static Organization organizationMerger( - Tuple2> e, final long ts) { - - Organization o = new Organization(); // the result of the merge, to be returned at the end - - o.setId(e._1()); - - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - - StringBuilder trust = new StringBuilder("0.0"); - - if (e._2() != null) - e - ._2() - .forEach( - pub -> { - try { - Organization organization = mapper.readValue(pub, Organization.class); - - final String currentTrust = organization.getDataInfo().getTrust(); - if (!"1.0".equals(currentTrust)) { - trust.setLength(0); - trust.append(currentTrust); - } - o.mergeFrom(organization); - - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - - if (o.getDataInfo() == null) { - o.setDataInfo(new DataInfo()); - } - if (o.getDataInfo() == null) - o.setDataInfo(new DataInfo()); - o.getDataInfo().setTrust("0.9"); - o.setLastupdatetimestamp(ts); - - return o; - } - - private static OtherResearchProduct otherresearchproductMerger( - Tuple2> e, final long ts) { - - OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be - // returned at the end - - o.setId(e._1()); - - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - - final Collection dateofacceptance = Lists.newArrayList(); - - if (e._2() != null) - e - ._2() - .forEach( - orp -> { - try { - OtherResearchProduct otherResearchProduct = mapper - .readValue(orp, OtherResearchProduct.class); - - o.mergeFrom(otherResearchProduct); - o - .setAuthor( - DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor())); - // add to the list if they are not null - if (otherResearchProduct.getDateofacceptance() != null) - dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (o.getDataInfo() == null) - o.setDataInfo(new DataInfo()); - o.setDateofacceptance(DatePicker.pick(dateofacceptance)); - o.getDataInfo().setTrust("0.9"); - o.setLastupdatetimestamp(ts); - return o; - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java deleted file mode 100644 index 364b49c16e..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ /dev/null @@ -1,239 +0,0 @@ - -package eu.dnetlib.dedup; - -import java.io.IOException; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.text.Normalizer; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.codec.binary.Hex; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.util.LongAccumulator; - -import com.google.common.collect.Sets; -import com.wcohen.ss.JaroWinkler; - -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.Person; -import scala.Tuple2; - -public class DedupUtility { - private static final Double THRESHOLD = 0.95; - - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { - - Map accumulators = new HashMap<>(); - - String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = String - .format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = String - .format( - "%s::%s", - dedupConf.getWf().getEntityType(), - String - .format( - "Skipped records for count(%s) >= %s", - dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); - accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = String - .format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); - - return accumulators; - } - - public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { - return context.textFile(path); - } - - public static void deleteIfExists(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))) { - fileSystem.delete(new Path(path), true); - } - } - - public static DedupConfig loadConfigFromHDFS(String path) throws IOException { - - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); - - return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - } - - static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } - - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes(StandardCharsets.UTF_8)); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } - - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); - - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } - - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); - - pidToEnrich - .forEach( - a -> { - Optional> simAuhtor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); - } - - public static String createEntityPath(final String basePath, final String entityType) { - return String.format("%s/%s", basePath, entityType); - } - - public static String createSimRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/simRel", basePath, entityType); - } - - public static String createMergeRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/mergeRel", basePath, entityType); - } - - private static Double sim(Author a, Author b) { - - final Person pa = parse(a); - final Person pb = parse(b); - - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } - - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } - - private static int countAuthorsPids(List authors) { - if (authors == null) - return 0; - - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } - - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } - - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java deleted file mode 100644 index e7d49be988..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java +++ /dev/null @@ -1,182 +0,0 @@ - -package eu.dnetlib.dedup; - -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFlatMapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.BlockProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Serializable; -import scala.Tuple2; - -public class Deduper implements Serializable { - - private static final Log log = LogFactory.getLog(Deduper.class); - - /** - * @return the list of relations generated by the deduplication - * @param: the spark context - * @param: list of JSON entities to be deduped - * @param: the dedup configuration - */ - public static JavaPairRDD dedup( - JavaSparkContext context, JavaRDD entities, DedupConfig config) { - - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - - // create vertexes of the graph: - JavaPairRDD mapDocs = mapToVertexes(context, entities, config); - - // create blocks for deduplication - JavaPairRDD> blocks = createBlocks(context, mapDocs, config); - - // create relations by comparing only elements in the same group - return computeRelations(context, blocks, config); - - // final RDD> edgeRdd = relationRDD.map(it -> new - // Edge<>(it._1().hashCode(), - // it._2().hashCode(), "equalTo")).rdd(); - // - // RDD> vertexes = - // mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> - // new - // Tuple2((long) t._1().hashCode(), t._2())).rdd(); - // accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); - // - // return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); - } - - /** - * @return the list of relations generated by the deduplication - * @param: the spark context - * @param: list of blocks - * @param: the dedup configuration - */ - public static JavaPairRDD computeRelations( - JavaSparkContext context, - JavaPairRDD> blocks, - DedupConfig config) { - - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - - return blocks - .flatMapToPair( - (PairFlatMapFunction>, String, String>) it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).process(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - }) - .mapToPair( - (PairFunction, String, Tuple2>) item -> new Tuple2>( - item._1() + item._2(), item)) - .reduceByKey((a, b) -> a) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - } - - /** - * @return the list of blocks based on clustering of dedup configuration - * @param: the spark context - * @param: list of entities: - * @param: the dedup configuration - */ - public static JavaPairRDD> createBlocks( - JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMapToPair( - (PairFlatMapFunction) a -> DedupUtility - .getGroupingKeys(config, a) - .stream() - .map(it -> new Tuple2<>(it, a)) - .collect(Collectors.toList()) - .iterator()) - .groupByKey(); - } - - public static JavaPairRDD> createsortedBlocks( - JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { - final String of = config.getWf().getOrderField(); - final int maxQueueSize = config.getWf().getGroupMaxSize(); - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMapToPair( - (PairFlatMapFunction>) a -> DedupUtility - .getGroupingKeys(config, a) - .stream() - .map( - it -> { - List tmp = new ArrayList<>(); - tmp.add(a); - return new Tuple2<>(it, tmp); - }) - .collect(Collectors.toList()) - .iterator()) - .reduceByKey( - (Function2, List, List>) (v1, v2) -> { - v1.addAll(v2); - v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue())); - if (v1.size() > maxQueueSize) - return new ArrayList<>(v1.subList(0, maxQueueSize)); - return v1; - }); - } - - /** - * @return the list of vertexes: - * @param: the spark context - * @param: list of JSON entities - * @param: the dedup configuration - */ - public static JavaPairRDD mapToVertexes( - JavaSparkContext context, JavaRDD entities, DedupConfig config) { - - return entities - .mapToPair( - (PairFunction) s -> { - MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s); - return new Tuple2(mapDocument.getIdentifier(), mapDocument); - }); - } - - public static JavaPairRDD computeRelations2( - JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - - return blocks - .flatMapToPair( - (PairFlatMapFunction>, String, String>) it -> { - try { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - } catch (Exception e) { - throw new RuntimeException(it._2().get(0).getIdentifier(), e); - } - }) - .mapToPair( - (PairFunction, String, Tuple2>) item -> new Tuple2>( - item._1() + item._2(), item)) - .reduceByKey((a, b) -> a) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java deleted file mode 100644 index bc99481901..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ /dev/null @@ -1,6 +0,0 @@ - -package eu.dnetlib.dedup; - -public enum OafEntityType { - datasource, organization, project, dataset, otherresearchproduct, software, publication -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java deleted file mode 100644 index 2f0b1e5744..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ /dev/null @@ -1,112 +0,0 @@ - -package eu.dnetlib.dedup; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.graphx.Edge; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.hash.Hashing; - -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; - -public class SparkCreateConnectedComponent { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkCreateConnectedComponent.class - .getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaPairRDD vertexes = spark - .read() - .load(inputPath + "/" + entity) - .as(Encoders.kryo(Oaf.class)) - .map((MapFunction) p -> new ObjectMapper().writeValueAsString(p), Encoders.STRING()) - .javaRDD() - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair( - (PairFunction) s -> new Tuple2(getHashcode(s), s)); - - final Dataset similarityRelations = spark - .read() - .load(DedupUtility.createSimRelPath(targetPath, entity)) - .as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = similarityRelations - .javaRDD() - .map( - it -> new Edge<>( - getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())) - .rdd(); - final JavaRDD cc = GraphProcessor - .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()) - .toJavaRDD(); - final Dataset mergeRelation = spark - .createDataset( - cc - .filter(k -> k.getDocIds().size() > 1) - .flatMap( - (FlatMapFunction) c -> c - .getDocIds() - .stream() - .flatMap( - id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass(ModelConstants.MERGES); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass(ModelConstants.IS_MERGED_IN); - tmp.add(r); - return tmp.stream(); - }) - .iterator()) - .rdd(), - Encoders.bean(Relation.class)); - mergeRelation - .write() - .mode("overwrite") - .save(DedupUtility.createMergeRelPath(targetPath, entity)); - } - - public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java deleted file mode 100644 index fa0ee1efb8..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ /dev/null @@ -1,59 +0,0 @@ - -package eu.dnetlib.dedup; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.pace.config.DedupConfig; - -public class SparkCreateDedupRecord { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkCreateDedupRecord.class - .getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String sourcePath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String dedupPath = parser.get("dedupPath"); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaRDD dedupRecord = DedupRecordFactory - .createDedupRecord( - sc, - spark, - DedupUtility.createMergeRelPath(dedupPath, entity), - DedupUtility.createEntityPath(sourcePath, entity), - OafEntityType.valueOf(entity), - dedupConf); - spark - .createDataset(dedupRecord.rdd(), Encoders.kryo(OafEntity.class)) - .write() - .mode(SaveMode.Overwrite) - .save(dedupPath + "/" + entity + "/dedup_records"); -// -// -// dedupRecord -// .map( -// r -> { -// ObjectMapper mapper = new ObjectMapper(); -// return mapper.writeValueAsString(r); -// }) -// .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records"); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java deleted file mode 100644 index 7adf992cd2..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ /dev/null @@ -1,92 +0,0 @@ - -package eu.dnetlib.dedup; - -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; - -/** - * This Spark class creates similarity relations between entities, saving result - *

- * param request: sourcePath entityType target Path - */ -public class SparkCreateSimRels { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkCreateSimRels.class - .getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - // final DedupConfig dedupConf = - // DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - JavaPairRDD mapDocument = spark - .read() - .load(inputPath + "/" + entity) - .as(Encoders.kryo(Oaf.class)) - .map((MapFunction) p -> new ObjectMapper().writeValueAsString(p), Encoders.STRING()) - .javaRDD() - .repartition(1000) - .mapToPair( - s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); - - // create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); - // JavaPairRDD> blocks = Deduper.createBlocks(sc, - // mapDocument, dedupConf); - - // create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); - // final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, - // dedupConf); - - final JavaRDD isSimilarToRDD = dedupRels - .map( - simRel -> { - final Relation r = new Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); - - spark - .createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)) - .write() - .mode("overwrite") - .save(DedupUtility.createSimRelPath(targetPath, entity)); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java deleted file mode 100644 index 21e72b5b8d..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ /dev/null @@ -1,52 +0,0 @@ - -package eu.dnetlib.dedup; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.pace.util.Reporter; -import scala.Serializable; -import scala.Tuple2; - -public class SparkReporter implements Serializable, Reporter { - - final List> relations = new ArrayList<>(); - private static final Log log = LogFactory.getLog(SparkReporter.class); - Map accumulators; - - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } - - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { - - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } - - @Override - public void incrementCounter(String counterGroup, String counterName, long delta) { - - incrementCounter(counterGroup, counterName, delta, accumulators); - } - - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } - - public List> getRelations() { - return relations; - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java deleted file mode 100644 index 79a3114fda..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ /dev/null @@ -1,84 +0,0 @@ - -package eu.dnetlib.dedup.graph; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Set; - -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.annotate.JsonIgnore; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dedup.DedupUtility; -import eu.dnetlib.pace.util.PaceException; - -public class ConnectedComponent implements Serializable { - - private Set docIds; - private String ccId; - - public ConnectedComponent() { - } - - public ConnectedComponent(Set docIds) { - this.docIds = docIds; - createID(); - } - - public String createID() { - if (docIds.size() > 1) { - final String s = getMin(); - String prefix = s.split("\\|")[0]; - ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); - return ccId; - } else { - return docIds.iterator().next(); - } - } - - @JsonIgnore - public String getMin() { - - final StringBuilder min = new StringBuilder(); - docIds - .forEach( - i -> { - if (StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); - return min.toString(); - } - - @Override - public String toString() { - ObjectMapper mapper = new ObjectMapper(); - try { - return mapper.writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Failed to create Json: ", e); - } - } - - public Set getDocIds() { - return docIds; - } - - public void setDocIds(Set docIds) { - this.docIds = docIds; - } - - public String getCcId() { - return ccId; - } - - public void setCcId(String ccId) { - this.ccId = ccId; - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala deleted file mode 100644 index 38c6951528..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala +++ /dev/null @@ -1,37 +0,0 @@ -package eu.dnetlib.dedup.graph - -import org.apache.spark.graphx._ -import org.apache.spark.rdd.RDD - -import scala.collection.JavaConversions; - -object GraphProcessor { - - def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = { - val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby - val cc = graph.connectedComponents(maxIterations).vertices - - val joinResult = vertexes.leftOuterJoin(cc).map { - case (id, (openaireId, cc)) => { - if (cc.isEmpty) { - (id, openaireId) - } - else { - (cc.get, openaireId) - } - } - } - val connectedComponents = joinResult.groupByKey() - .map[ConnectedComponent](cc => asConnectedComponent(cc)) - connectedComponents - } - - - - def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = { - val docs = group._2.toSet[String] - val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs)); - connectedComponent - } - -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java deleted file mode 100644 index 3134f94000..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java +++ /dev/null @@ -1,78 +0,0 @@ - -package eu.dnetlib.dedup.sx; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.scholexplorer.OafUtils; -import scala.Tuple2; - -public class SparkPropagateRelationsJob { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkPropagateRelationsJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final String relationPath = parser.get("relationPath"); - final String mergeRelPath = parser.get("mergeRelPath"); - final String targetRelPath = parser.get("targetRelPath"); - - final Dataset merge = spark - .read() - .load(mergeRelPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'"); - - final Dataset rels = spark - .read() - .load(relationPath) - .as(Encoders.kryo(Relation.class)) - .map( - (MapFunction) r -> r, - Encoders.bean(Relation.class)); - - final Dataset firstJoin = rels - .joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") - .map( - (MapFunction, Relation>) r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); - if (mergeRelation != null) - relation.setSource(mergeRelation.getSource()); - if (relation.getDataInfo() == null) - relation.setDataInfo(OafUtils.generateDataInfo("0.9", false)); - return relation; - }, - Encoders.bean(Relation.class)); - - final Dataset secondJoin = firstJoin - .joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer") - .map( - (MapFunction, Relation>) r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); - if (mergeRelation != null) - relation.setTarget(mergeRelation.getSource()); - return relation; - }, - Encoders.kryo(Relation.class)); - - secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java deleted file mode 100644 index a847ad6125..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java +++ /dev/null @@ -1,102 +0,0 @@ - -package eu.dnetlib.dedup.sx; - -import java.io.IOException; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.*; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; -import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; -import eu.dnetlib.dhp.utils.DHPUtils; -import scala.Tuple2; - -public class SparkUpdateEntityJob { - - static final String IDJSONPATH = "$.id"; - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkUpdateEntityJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String entityPath = parser.get("entityPath"); - final String mergeRelPath = parser.get("mergeRelPath"); - final String dedupRecordPath = parser.get("dedupRecordPath"); - final String entity = parser.get("entity"); - final String destination = parser.get("targetPath"); - - final Dataset df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = df - .where("relClass == 'merges'") - .select(df.col("target")) - .distinct() - .toJavaRDD() - .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); - final JavaRDD sourceEntity = sc.textFile(entityPath); - - final JavaRDD dedupEntity = sc.textFile(dedupRecordPath); - JavaPairRDD entitiesWithId = sourceEntity - .mapToPair( - (PairFunction) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); - Class mainClass; - switch (entity) { - case "publication": - mainClass = DLIPublication.class; - break; - case "dataset": - mainClass = DLIDataset.class; - break; - case "unknown": - mainClass = DLIUnknown.class; - break; - default: - throw new IllegalArgumentException("Illegal type " + entity); - } - JavaRDD map = entitiesWithId - .leftOuterJoin(mergedIds) - .map( - k -> k._2()._2().isPresent() - ? updateDeletedByInference(k._2()._1(), mainClass) - : k._2()._1()); - map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); - } - - private static String updateDeletedByInference( - final String json, final Class clazz) { - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - try { - Oaf entity = mapper.readValue(json, clazz); - if (entity.getDataInfo() == null) - entity.setDataInfo(new DataInfo()); - entity.getDataInfo().setDeletedbyinference(true); - return mapper.writeValueAsString(entity); - } catch (IOException e) { - throw new RuntimeException("Unable to convert json", e); - } - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityWithDedupInfo.scala b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityWithDedupInfo.scala deleted file mode 100644 index ce883e2072..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityWithDedupInfo.scala +++ /dev/null @@ -1,75 +0,0 @@ -package eu.dnetlib.dedup.sx - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation} -import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown, OafUtils} -import org.apache.commons.io.IOUtils -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.slf4j.LoggerFactory -import org.apache.spark.sql.functions.col - -object SparkUpdateEntityWithDedupInfo { - - def main(args: Array[String]): Unit = { - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityWithDedupInfo.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))) - val logger = LoggerFactory.getLogger(SparkUpdateEntityWithDedupInfo.getClass) - parser.parseArgument(args) - - val workingPath: String = parser.get("workingPath") - logger.info(s"Working dir path = $workingPath") - - implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity] - implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation]) - - implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] - implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset] - implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown] - - - - val spark: SparkSession = SparkSession - .builder() - .appName(SparkUpdateEntityWithDedupInfo.getClass.getSimpleName) - .master(parser.get("master")) - .getOrCreate() - - - val entityPath = parser.get("entityPath") - val mergeRelPath = parser.get("mergeRelPath") - val dedupRecordPath = parser.get("dedupRecordPath") - val entity = parser.get("entity") - val destination = parser.get("targetPath") - - val mergedIds = spark.read.load(mergeRelPath).as[Relation] - .where("relClass == 'merges'") - .select(col("target")) - - - val entities: Dataset[(String, OafEntity)] = spark - .read - .load(entityPath).as[OafEntity] - .map(o => (o.getId, o))(Encoders.tuple(Encoders.STRING, oafEncoder)) - - - val finalDataset:Dataset[OafEntity] = entities.joinWith(mergedIds, entities("_1").equalTo(mergedIds("target")), "left") - .map(k => { - val e: OafEntity = k._1._2 - val t = k._2 - if (t != null && t.getString(0).nonEmpty) { - if (e.getDataInfo == null) { - e.setDataInfo(OafUtils.generateDataInfo()) - } - e.getDataInfo.setDeletedbyinference(true) - } - e - }) - - val dedupRecords :Dataset[OafEntity] = spark.read.load(dedupRecordPath).as[OafEntity] - - finalDataset.union(dedupRecords) - .repartition(1200).write - .mode(SaveMode.Overwrite).save(destination) - - } - -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json deleted file mode 100644 index de744dfb63..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the path of the sequential file to read", - "paramRequired": true - }, - { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped", - "paramRequired": true - }, - { - "paramName": "c", - "paramLongName": "dedupConf", - "paramDescription": "dedup configuration to be used", - "compressed": true, - "paramRequired": true - }, - { - "paramName": "d", - "paramLongName": "dedupPath", - "paramDescription": "dedup path to load mergeRelation", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json deleted file mode 100644 index 69428a2963..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json +++ /dev/null @@ -1,38 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "ep", - "paramLongName": "entityPath", - "paramDescription": "the input entity path", - "paramRequired": true - }, - { - "paramName": "mr", - "paramLongName": "mergeRelPath", - "paramDescription": "the input path of merge Rel", - "paramRequired": true - }, - { - "paramName": "dr", - "paramLongName": "dedupRecordPath", - "paramDescription": "the inputPath of dedup record", - "paramRequired": true - }, - { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "the type of entity", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "the targetPath", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json deleted file mode 100644 index 8ba8515d0e..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the path of the sequential file to read", - "paramRequired": true - }, - { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped", - "paramRequired": true - }, - { - "paramName": "c", - "paramLongName": "dedupConf", - "paramDescription": "dedup configuration to be used", - "compressed": true, - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "target path to save dedup result", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json deleted file mode 100644 index 2ce78440fb..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "ep", - "paramLongName": "relationPath", - "paramDescription": "the input relation path", - "paramRequired": true - }, - { - "paramName": "mr", - "paramLongName": "mergeRelPath", - "paramDescription": "the input path of merge Rel", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetRelPath", - "paramDescription": "the output Rel Path", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/config-default.xml deleted file mode 100644 index 2e0ed9aeea..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/config-default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/workflow.xml deleted file mode 100644 index 2214fd20ac..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/workflow.xml +++ /dev/null @@ -1,182 +0,0 @@ - - - - sourcePath - the source path - - - entity - the entity that should be processed - - - dedupConf - the dedup Configuration - - - targetPath - the target path - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels - dhp-dedup-scholexplorer-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --executor-cores=${sparkExecutorCores} - ${sparkExtraOPT} - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Connected Components - eu.dnetlib.dedup.SparkCreateConnectedComponent - dhp-dedup-scholexplorer-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --executor-cores=${sparkExecutorCores} - ${sparkExtraOPT} - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Dedup Record - eu.dnetlib.dedup.SparkCreateDedupRecord - dhp-dedup-scholexplorer-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --executor-cores=${sparkExecutorCores} - ${sparkExtraOPT} - - -mtyarn-cluster - --sourcePath${sourcePath} - --dedupPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Propagate Dedup Relations - eu.dnetlib.dedup.sx.SparkPropagateRelationsJob - dhp-dedup-scholexplorer-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --executor-cores=${sparkExecutorCores} - ${sparkExtraOPT} - - -mtyarn-cluster - --mergeRelPath${targetPath}/${entity}/mergeRel - --relationPath${sourcePath}/relation - --targetRelPath${targetPath}/${entity}/updated_relation - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Update ${entity} and add DedupRecord - eu.dnetlib.dedup.sx.SparkUpdateEntityWithDedupInfo - dhp-dedup-scholexplorer-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --executor-cores=${sparkExecutorCores} - ${sparkExtraOPT} - - -mtyarn-cluster - --entityPath${sourcePath}/${entity} - --mergeRelPath${targetPath}/${entity}/mergeRel - --entity${entity} - --dedupRecordPath${targetPath}/${entity}/dedup_records - --targetPath${targetPath}/${entity}/updated_record - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/sx/conf/pub_scholix.conf.json b/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/sx/conf/pub_scholix.conf.json deleted file mode 100644 index d914198534..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/sx/conf/pub_scholix.conf.json +++ /dev/null @@ -1,378 +0,0 @@ -{ - "wf": { - "threshold": "0.99", - "dedupRun": "001", - "entityType": "result", - "subEntityType": "resulttype", - "subEntityValue": "publication", - "orderField": "title", - "queueMaxSize": "2000", - "groupMaxSize": "100", - "maxChildren": "100", - "slidingWindowSize": "200", - "rootBuilder": [ - ], - "includeChildren": "true", - "maxIterations": 20, - "idPath": "$.id" - }, - "pace": { - "clustering": [ - { - "name": "ngrampairs", - "fields": [ - "title" - ], - "params": { - "max": "1", - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "title" - ], - "params": { - "max": "1", - "len": "3" - } - } - ], - "decisionTree": { - "start": { - "fields": [ - { - "field": "pid", - "comparator": "jsonListMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": { - "jpath_value": "$.value", - "jpath_classid": "$.qualifier.classid" - } - } - ], - "threshold": 0.5, - "aggregation": "AVG", - "positive": "MATCH", - "negative": "layer2", - "undefined": "layer2", - "ignoreUndefined": "true" - }, - "layer2": { - "fields": [ - { - "field": "title", - "comparator": "titleVersionMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - }, - { - "field": "authors", - "comparator": "sizeMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - } - ], - "threshold": 1.0, - "aggregation": "AND", - "positive": "layer3", - "negative": "NO_MATCH", - "undefined": "layer3", - "ignoreUndefined": "false" - }, - "layer3": { - "fields": [ - { - "field": "title", - "comparator": "levensteinTitle", - "weight": 1.0, - "countIfUndefined": "true", - "params": {} - } - ], - "threshold": 0.99, - "aggregation": "AVG", - "positive": "MATCH", - "negative": "NO_MATCH", - "undefined": "NO_MATCH", - "ignoreUndefined": "true" - } - }, - "model": [ - { - "name": "pid", - "type": "JSON", - "path": "$.pid", - "overrideMatch": "true" - }, - { - "name": "title", - "type": "String", - "path": "$.title[*].value", - "length": 250, - "size": 5 - }, - { - "name": "authors", - "type": "List", - "path": "$.author[*].fullname", - "size": 200 - }, - { - "name": "resulttype", - "type": "String", - "path": "$.resulttype.classid" - } - ], - "blacklists": { - "title": [ - "^Inside Front Cover$", - "^CORR Insights$", - "^Index des notions$", - "^Department of Error.$", - "^Untitled Item$", - "^Department of Error$", - "^Tome II : 1598 à 1605$", - "^(à l’exception de roi, prince, royauté, pouvoir, image… qui sont omniprésents)$", - "^Museen und Ausstellungsinstitute in Nürnberg$", - "^Text/Conference Paper$", - "^Table des illustrations$", - "^An Intimate Insight on Psychopathy and a Novel Hermeneutic Psychological Science$", - "^Index des noms$", - "^Reply by Authors.$", - "^Titelblatt - Inhalt$", - "^Index des œuvres,$", - "(?i)^Poster presentations$", - "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", - "^Problems with perinatal pathology\\.?$", - "(?i)^Cases? of Puerperal Convulsions$", - "(?i)^Operative Gyna?ecology$", - "(?i)^Mind the gap\\!?\\:?$", - "^Chronic fatigue syndrome\\.?$", - "^Cartas? ao editor Letters? to the Editor$", - "^Note from the Editor$", - "^Anesthesia Abstract$", - "^Annual report$", - "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", - "(?i)^Graph and Table of Infectious Diseases?$", - "^Presentation$", - "(?i)^Reviews and Information on Publications$", - "(?i)^PUBLIC HEALTH SERVICES?$", - "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", - "(?i)^Adrese autora$", - "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", - "(?i)^Acknowledgement to Referees$", - "(?i)^Behçet's disease\\.?$", - "(?i)^Isolation and identification of restriction endonuclease.*$", - "(?i)^CEREBROVASCULAR DISEASES?.?$", - "(?i)^Screening for abdominal aortic aneurysms?\\.?$", - "^Event management$", - "(?i)^Breakfast and Crohn's disease.*\\.?$", - "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", - "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", - "^Gushi hakubutsugaku$", - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", - "^Intestinal spirocha?etosis$", - "^Treatment of Rodent Ulcer$", - "(?i)^\\W*Cloud Computing\\W*$", - "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", - "^Free Communications, Poster Presentations: Session [A-F]$", - "^“The Historical Aspects? of Quackery\\.?”$", - "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", - "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", - "(?i)^Case Report$", - "^Boletín Informativo$", - "(?i)^Glioblastoma Multiforme$", - "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", - "^Zaměstnanecké výhody$", - "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", - "(?i)^Carotid body tumours?\\.?$", - "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", - "^Avant-propos$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", - "(?i)^PUBLIC HEALTH VERSUS THE STATE$", - "^Viñetas de Cortázar$", - "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", - "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", - "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", - "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", - "^Aus der AGMB$", - "^Znanstveno-stručni prilozi$", - "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", - "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", - "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", - "^Finanční analýza podniku$", - "^Financial analysis( of business)?$", - "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", - "^Jikken nihon shūshinsho$", - "(?i)^CORONER('|s)(s|') INQUESTS$", - "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", - "(?i)^Consultants' contract(s)?$", - "(?i)^Upute autorima$", - "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", - "^Joshi shin kokubun$", - "^Kōtō shōgaku dokuhon nōson'yō$", - "^Jinjō shōgaku shōka$", - "^Shōgaku shūjichō$", - "^Nihon joshi dokuhon$", - "^Joshi shin dokuhon$", - "^Chūtō kanbun dokuhon$", - "^Wabun dokuhon$", - "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", - "(?i)^cardiac rehabilitation$", - "(?i)^Analytical summary$", - "^Thesaurus resolutionum Sacrae Congregationis Concilii$", - "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", - "^Prikazi i osvrti$", - "^Rodinný dům s provozovnou$", - "^Family house with an establishment$", - "^Shinsei chūtō shin kokugun$", - "^Pulmonary alveolar proteinosis(\\.?)$", - "^Shinshū kanbun$", - "^Viñeta(s?) de Rodríguez$", - "(?i)^RUBRIKA UREDNIKA$", - "^A Matching Model of the Academic Publication Market$", - "^Yōgaku kōyō$", - "^Internetový marketing$", - "^Internet marketing$", - "^Chūtō kokugo dokuhon$", - "^Kokugo dokuhon$", - "^Antibiotic Cover for Dental Extraction(s?)$", - "^Strategie podniku$", - "^Strategy of an Enterprise$", - "(?i)^respiratory disease(s?)(\\.?)$", - "^Award(s?) for Gallantry in Civil Defence$", - "^Podniková kultura$", - "^Corporate Culture$", - "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", - "^Pracovní motivace$", - "^Work Motivation$", - "^Kaitei kōtō jogaku dokuhon$", - "^Konsolidovaná účetní závěrka$", - "^Consolidated Financial Statements$", - "(?i)^intracranial tumour(s?)$", - "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", - "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", - "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", - "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", - "^Úroveň motivačního procesu jako způsobu vedení lidí$", - "^The level of motivation process as a leadership$", - "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", - "(?i)^news and events$", - "(?i)^NOVOSTI I DOGAĐAJI$", - "^Sansū no gakushū$", - "^Posouzení informačního systému firmy a návrh změn$", - "^Information System Assessment and Proposal for ICT Modification$", - "^Stresové zatížení pracovníků ve vybrané profesi$", - "^Stress load in a specific job$", - "^Sunday: Poster Sessions, Pt.*$", - "^Monday: Poster Sessions, Pt.*$", - "^Wednesday: Poster Sessions, Pt.*", - "^Tuesday: Poster Sessions, Pt.*$", - "^Analýza reklamy$", - "^Analysis of advertising$", - "^Shōgaku shūshinsho$", - "^Shōgaku sansū$", - "^Shintei joshi kokubun$", - "^Taishō joshi kokubun dokuhon$", - "^Joshi kokubun$", - "^Účetní uzávěrka a účetní závěrka v ČR$", - "(?i)^The \"?Causes\"? of Cancer$", - "^Normas para la publicación de artículos$", - "^Editor('|s)(s|') [Rr]eply$", - "^Editor(’|s)(s|’) letter$", - "^Redaktoriaus žodis$", - "^DISCUSSION ON THE PRECEDING PAPER$", - "^Kōtō shōgaku shūshinsho jidōyō$", - "^Shōgaku nihon rekishi$", - "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", - "^Préface$", - "^Occupational [Hh]ealth [Ss]ervices.$", - "^In Memoriam Professor Toshiyuki TAKESHIMA$", - "^Účetní závěrka ve vybraném podniku.*$", - "^Financial statements in selected company$", - "^Abdominal [Aa]ortic [Aa]neurysms.*$", - "^Pseudomyxoma peritonei$", - "^Kazalo autora$", - "(?i)^uvodna riječ$", - "^Motivace jako způsob vedení lidí$", - "^Motivation as a leadership$", - "^Polyfunkční dům$", - "^Multi\\-funkcional building$", - "^Podnikatelský plán$", - "(?i)^Podnikatelský záměr$", - "(?i)^Business Plan$", - "^Oceňování nemovitostí$", - "^Marketingová komunikace$", - "^Marketing communication$", - "^Sumario Analítico$", - "^Riječ uredništva$", - "^Savjetovanja i priredbe$", - "^Índice$", - "^(Starobosanski nadpisi).*$", - "^Vzdělávání pracovníků v organizaci$", - "^Staff training in organization$", - "^(Life Histories of North American Geometridae).*$", - "^Strategická analýza podniku$", - "^Strategic Analysis of an Enterprise$", - "^Sadržaj$", - "^Upute suradnicima$", - "^Rodinný dům$", - "(?i)^Fami(l)?ly house$", - "^Upute autorima$", - "^Strategic Analysis$", - "^Finanční analýza vybraného podniku$", - "^Finanční analýza$", - "^Riječ urednika$", - "(?i)^Content(s?)$", - "(?i)^Inhalt$", - "^Jinjō shōgaku shūshinsho jidōyō$", - "(?i)^Index$", - "^Chūgaku kokubun kyōkasho$", - "^Retrato de una mujer$", - "^Retrato de un hombre$", - "^Kōtō shōgaku dokuhon$", - "^Shotōka kokugo$", - "^Shōgaku dokuhon$", - "^Jinjō shōgaku kokugo dokuhon$", - "^Shinsei kokugo dokuhon$", - "^Teikoku dokuhon$", - "^Instructions to Authors$", - "^KİTAP TAHLİLİ$", - "^PRZEGLĄD PIŚMIENNICTWA$", - "(?i)^Presentación$", - "^İçindekiler$", - "(?i)^Tabl?e of contents$", - "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", - "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", - "^Editorial( Board)?$", - "(?i)^Editorial \\(English\\)$", - "^Editörden$", - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*", - "(?i)^.*authors['’′]? reply\\.?$", - "(?i)^.*authors['’′]? response\\.?$" - ] - }, - "synonyms": {} - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml deleted file mode 100644 index 188e0debcf..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml +++ /dev/null @@ -1,79 +0,0 @@ - - - - dhp-workflows - eu.dnetlib.dhp - 1.2.4-SNAPSHOT - - 4.0.0 - - dhp-graph-provision-scholexplorer - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - - - - - - - org.apache.spark - spark-core_2.11 - - - - org.apache.spark - spark-sql_2.11 - - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - - org.apache.httpcomponents - httpmime - - - - org.elasticsearch - elasticsearch-hadoop - - - - - org.apache.httpcomponents - httpclient - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala deleted file mode 100644 index b71b7f0549..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ /dev/null @@ -1,425 +0,0 @@ -package eu.dnetlib.dhp.export - -import com.fasterxml.jackson.databind.ObjectMapper - -import java.time.LocalDateTime -import java.time.format.DateTimeFormatter -import eu.dnetlib.dhp.common.PacePerson -import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} -import eu.dnetlib.dhp.utils.DHPUtils -import org.apache.commons.lang3.StringUtils - - -import scala.collection.JavaConverters._ - - -case class DLIExternalReference(id: String, url: String, sitename: String, label: String, pid: String, classId: String) {} - -object DLIToOAF { - - - val collectedFromMap: Map[String, KeyValue] = Map( - "dli_________::r3d100010527" -> generateKeyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive"), - "dli_________::r3d100010255" -> generateKeyValue("10|re3data_____::480d275ed6f9666ee76d6a1215eabf26", "Inter-university Consortium for Political and Social Research"), - "dli_________::r3d100011868" -> generateKeyValue("10|re3data_____::db814dc656a911b556dba42a331cebe9", "Mendeley Data"), - "dli_________::elsevier" -> generateKeyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier"), - "dli_________::openaire" -> generateKeyValue("10|infrastruct_::f66f1bd369679b5b077dcdf006089556", "OpenAIRE"), - "dli_________::thomsonreuters" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"), - "dli_________::r3d100010216" -> generateKeyValue("10|re3data_____::0fd79429de04343dbbec705d9b5f429f", "4TU.Centre for Research Data"), - "dli_________::r3d100010134" -> generateKeyValue("10|re3data_____::9633d1e8c4309c833c2c442abeb0cfeb", "PANGAEA"), - "dli_________::ieee" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"), - "dli_________::r3d100010197" -> generateKeyValue("10|re3data_____::9fd1d79973f7fda60cbe1d82e3819a68", "The Cambridge Structural Database"), - "dli_________::nature" -> generateKeyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature"), - "dli_________::datacite" -> generateKeyValue("10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite"), - "dli_________::r3d100010578" -> generateKeyValue("10|re3data_____::c4d751f29a7568011a4c80136b30b444", "IEDA"), - "dli_________::r3d100010464" -> generateKeyValue("10|re3data_____::23e2a81591099828f6b83a1c83150666", "Research Data Australia"), - "dli_________::r3d100010327" -> generateKeyValue("10|re3data_____::a644620b81135243dc9acc15d2362246", "Worldwide Protein Data Bank"), - "dli_________::pubmed" -> generateKeyValue("10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "PubMed Central"), - "dli_________::europe_pmc__" -> generateKeyValue("10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "Europe PubMed Central"), - "dli_________::crossref" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref") - ) - - - val relationTypeMapping: Map[String, (String, String)] = Map( - "IsReferencedBy" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "References" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "IsRelatedTo" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "IsSupplementedBy" -> (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT), - "Documents" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "Cites" -> (ModelConstants.CITES, ModelConstants.CITATION), - "Unknown" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "IsSourceOf" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "IsCitedBy" -> (ModelConstants.IS_CITED_BY, ModelConstants.CITATION), - "Reviews" -> (ModelConstants.REVIEWS, ModelConstants.REVIEW), - "Describes" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), - "HasAssociationWith" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP) - ) - - val expectecdPidType = List("uniprot", "ena", "chembl", "ncbi-n", "ncbi-p", "genbank", "pdb", "url") - - - val filteredURL = List( - "www.ebi.ac.uk", - "www.uniprot.org", - "f1000.com", - "en.wikipedia.org", - "flybase.org", - "www.yeastgenome.org", - "research.bioinformatics.udel.edu", - "cancer.sanger.ac.uk", - "www.iedb.org", - "www.crd.york.ac.uk", - "www.wormbase.org", - "web.expasy.org", - "www.hal.inserm.fr", - "sabiork.h-its.org", - "zfin.org", - "www.pombase.org", - "www.guidetopharmacology.org", - "reactome.org" - ) - - - val rel_inverse: Map[String, String] = Map( - ModelConstants.IS_RELATED_TO -> ModelConstants.IS_RELATED_TO, - ModelConstants.IS_SUPPLEMENTED_BY -> ModelConstants.IS_SUPPLEMENT_TO, - ModelConstants.CITES -> ModelConstants.IS_CITED_BY, - ModelConstants.IS_CITED_BY -> ModelConstants.CITES, - ModelConstants.REVIEWS -> ModelConstants.IS_REVIEWED_BY - ) - - - val PidTypeMap: Map[String, String] = Map( - "pbmid" -> "pmid", - "pmcid" -> "pmc", - "pmid" -> "pmid", - "pubmedid" -> "pmid", - "DOI" -> "doi", - "doi" -> "doi" - ) - - - def fixInstance(r:Publication) :Publication = { - val collectedFrom = r.getCollectedfrom.asScala.head - r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom)) - r - } - - - def fixInstanceDataset(r:Dataset) :Dataset = { - val collectedFrom = r.getCollectedfrom.asScala.head - r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom)) - r - } - - - def toActionSet(item: Oaf): (String, String) = { - val mapper = new ObjectMapper() - - item match { - case dataset: Dataset => - val a: AtomicAction[Dataset] = new AtomicAction[Dataset] - a.setClazz(classOf[Dataset]) - a.setPayload(dataset) - (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case publication: Publication => - val a: AtomicAction[Publication] = new AtomicAction[Publication] - a.setClazz(classOf[Publication]) - a.setPayload(publication) - (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case relation: Relation => - val a: AtomicAction[Relation] = new AtomicAction[Relation] - a.setClazz(classOf[Relation]) - a.setPayload(relation) - (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case _ => - null - } - } - - def convertClinicalTrial(dataset: DLIDataset): (String, String) = { - val currentId = generateId(dataset.getId) - val pids = dataset.getPid.asScala.filter(p => "clinicaltrials.gov".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => s"50|r3111dacbab5::${DHPUtils.md5(p.getValue.toLowerCase())}") - if (pids.isEmpty) - null - else - (currentId, pids.head) - } - - - def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = { - - val eRefs = externalReferences.map(e => { - val result = new ExternalReference() - result.setSitename(e.sitename) - result.setLabel(e.label) - result.setUrl(e.url) - result.setRefidentifier(e.pid) - result.setDataInfo(generateDataInfo()) - result.setQualifier(createQualifier(e.classId, ModelConstants.DNET_EXTERNAL_REFERENCE_TYPE)) - result - }) - publication.setExternalReference(eRefs.asJava) - publication - - } - - def filterPid(p: StructuredProperty): Boolean = { - if (expectecdPidType.contains(p.getQualifier.getClassname) && p.getQualifier.getClassname.equalsIgnoreCase("url")) - if (filteredURL.exists(u => p.getValue.contains(u))) - return true - else - return false - expectecdPidType.contains(p.getQualifier.getClassname) - } - - - def extractTitle(titles: java.util.List[StructuredProperty]): String = { - - if (titles == null) - return null - - val label = titles.asScala.map(p => p.getValue).find(p => p.nonEmpty) - label.orNull - } - - def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = { - val pids = dataset.getPid.asScala.filter(filterPid) - - if (pids == null || pids.isEmpty) - return null - - val pid: StructuredProperty = pids.head - - - pid.getQualifier.getClassname match { - case "uniprot" => DLIExternalReference(generateId(dataset.getId), s"https://www.uniprot.org/uniprot/${pid.getValue}", "UniProt", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - case "ena" => - if (pid.getValue != null && pid.getValue.nonEmpty && pid.getValue.length > 7) - DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/ena/data/view/${pid.getValue.substring(0, 8)}", "European Nucleotide Archive", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - else - null - case "chembl" => DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/chembl/compound_report_card/${pid.getValue}", "ChEMBL", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - case "ncbi-n" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "Nucleotide Database", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - case "ncbi-p" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "Nucleotide Database", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - case "genbank" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "GenBank", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - case "pdb" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "Protein Data Bank", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") - case "url" => DLIExternalReference(generateId(dataset.getId), pid.getValue, "", extractTitle(dataset.getTitle), pid.getValue, "url") - - } - - - } - - - def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = { - val result = new Publication - val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid)) - .map(p => { - p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid)) - p - }) - if (cleanedPids.isEmpty) - return null - result.setId(generateId(inputPublication.getId)) - result.setDataInfo(generateDataInfo(invisible = true)) - if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null)) - return null - result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) - if(result.getCollectedfrom.isEmpty) - return null - result.setPid(cleanedPids.asJava) - result.setDateofcollection(inputPublication.getDateofcollection) - result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava) - result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))) - if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty) - return null - result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava) - result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES)) - - if (inputPublication.getSubject != null) - result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava) - - if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty) - return null - - result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava) - - if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0) - return null - - result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava) - - - result.setDescription(inputPublication.getDescription) - - result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue)) - result.setPublisher(inputPublication.getPublisher) - result.setSource(inputPublication.getSource) - result.setBestaccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)) - - val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue) - if (dois.isEmpty) - return null - - - val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance) - - if (i != null) - result.setInstance(List(i).asJava) - - result - } - - - def convertDLIRelation(r: Relation): Relation = { - - val rt = r.getRelType - if (!relationTypeMapping.contains(rt)) - return null - r.setRelType(ModelConstants.RESULT_RESULT) - r.setRelClass(relationTypeMapping(rt)._1) - r.setSubRelType(relationTypeMapping(rt)._2) - r.setSource(generateId(r.getSource)) - r.setTarget(generateId(r.getTarget)) - r - } - - - def convertDLIDatasetTOOAF(d: DLIDataset): Dataset = { - - if (d.getCollectedfrom == null || d.getCollectedfrom.size() == 0 || (d.getCollectedfrom.size() == 1 && d.getCollectedfrom.get(0) == null)) - return null - val result: Dataset = new Dataset - result.setId(generateId(d.getId)) - result.setDataInfo(generateDataInfo()) - result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) - if(result.getCollectedfrom.isEmpty) - return null - - - result.setPid(d.getPid) - - val fpids = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname) || - "pdb".equalsIgnoreCase(p.getQualifier.getClassname) - ).map(p => p.getValue) - - if (fpids == null || fpids.isEmpty) - return null - - - result.setDateofcollection(d.getDateofcollection) - result.setOriginalId(d.getPid.asScala.map(d => d.getValue).asJava) - result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))) - if (d.getAuthor == null || d.getAuthor.isEmpty) - return null - result.setAuthor(d.getAuthor.asScala.map(convertAuthor).asJava) - result.setResulttype(createQualifier(d.getResulttype.getClassid, d.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES)) - - if (d.getSubject != null) - result.setSubject(d.getSubject.asScala.map(convertSubject).asJava) - - if (d.getTitle == null || d.getTitle.isEmpty) - return null - - result.setTitle(List(patchTitle(d.getTitle.get(0))).asJava) - - if (d.getRelevantdate == null || d.getRelevantdate.size() == 0) - return null - - result.setRelevantdate(d.getRelevantdate.asScala.map(patchRelevantDate).asJava) - - - result.setDescription(d.getDescription) - - result.setDateofacceptance(asField(d.getRelevantdate.get(0).getValue)) - result.setPublisher(d.getPublisher) - result.setSource(d.getSource) - result.setBestaccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)) - - - val instance_urls = if (fpids.head.length < 5) s"https://www.rcsb.org/structure/${fpids.head}" else s"https://dx.doi.org/${fpids.head}" - - val i: Instance = createInstance(instance_urls, firstInstanceOrNull(d.getInstance()), result.getDateofacceptance, true) - - // Ticket #6281 added pid to Instance - i.setPid(result.getPid) - if (i != null) - result.setInstance(List(i).asJava) - - result - } - - - def firstInstanceOrNull(instances: java.util.List[Instance]): Instance = { - - if (instances == null || instances.size() == 0) - return null - instances.get(0) - - } - - - def createInstance(url: String, originalInstance: Instance, doa: Field[String], dataset: Boolean = false): Instance = { - - val i = new Instance - i.setUrl(List(url).asJava) - if (dataset) - i.setInstancetype(createQualifier("0021", "Dataset", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) - else - i.setInstancetype(createQualifier("0000", "Unknown", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) - if (originalInstance != null && originalInstance.getHostedby != null) - i.setHostedby(originalInstance.getHostedby) - - i.setAccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)) - i.setDateofacceptance(doa) - - i - - - } - - - def patchRelevantDate(d: StructuredProperty): StructuredProperty = { - d.setQualifier(createQualifier(ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE)) - d - - } - - def patchTitle(t: StructuredProperty): StructuredProperty = { - t.setQualifier(ModelConstants.MAIN_TITLE_QUALIFIER) - t - } - - - def convertSubject(s: StructuredProperty): StructuredProperty = { - s.setQualifier(createQualifier("keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)) - s - - - } - - - def convertAuthor(a: Author): Author = { - if (a == null) - return a - val p = new PacePerson(a.getFullname, false) - if (p.isAccurate) { - a.setName(p.getNameString) - a.setSurname(p.getSurnameString) - } - a - } - - - def generateId(id: String): String = { - val md5 = if (id.contains("::")) StringUtils.substringAfter(id, "::") else StringUtils.substringAfter(id, "|") - s"50|scholix_____::$md5" - } - - - - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala deleted file mode 100644 index 3f632af226..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala +++ /dev/null @@ -1,175 +0,0 @@ -package eu.dnetlib.dhp.`export` - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} -import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} -import org.apache.commons.io.IOUtils -import org.apache.hadoop.io.Text -import org.apache.hadoop.io.compress.GzipCodec -import org.apache.hadoop.mapred.SequenceFileOutputFormat -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.expressions.Window -import org.apache.spark.SparkConf - -import scala.collection.mutable.ArrayBuffer - -object SparkExportContentForOpenAire { - - def main(args: Array[String]): Unit = { - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json"))) - parser.parseArgument(args) - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(SparkExportContentForOpenAire.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - - - val workingPath = parser.get("workingDirPath") - - implicit val dliPubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication]) - implicit val dliDatEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset]) - implicit val pubEncoder: Encoder[Publication] = Encoders.bean(classOf[Publication]) - implicit val datEncoder: Encoder[OafDataset] = Encoders.bean(classOf[OafDataset]) - implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation]) - - import spark.implicits._ - - val dsRel = spark.read.load(s"$workingPath/relation_b").as[Relation] - dsRel.filter(r => r.getDataInfo==null || r.getDataInfo.getDeletedbyinference ==false) - .map(DLIToOAF.convertDLIRelation) - .filter(r => r!= null) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS") - - - val dsPubs = spark.read.load(s"$workingPath/publication").as[DLIPublication] - dsPubs - .filter(p=>p.getDataInfo.getDeletedbyinference == false) - .map(DLIToOAF.convertDLIPublicationToOAF) - .filter(p=>p!= null) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS") - - - val dsDataset = spark.read.load(s"$workingPath/dataset").as[DLIDataset] - dsDataset - .filter(p => p.getDataInfo.getDeletedbyinference == false) - .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetDS") - - - - - val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS").as[Publication] - val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/export/datasetDS").as[OafDataset] - val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS").as[Relation] - - - val pub_id = pubs.select("id").distinct() - val dat_id = dats.select("id").distinct() - - - pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_f1") - - val relDS2= spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation] - - relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_filtered") - - - val r_source = relDS2.select(relDS2("source")).distinct() - val r_target = relDS2.select(relDS2("target")).distinct() - - - val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp") - - pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1) - .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") - .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS_filtered") - - dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1) - .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") - .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS") - - - dsDataset.map(DLIToOAF.convertDLIDatasetToExternalReference).filter(p => p != null).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference") - - val pf = spark.read.load(s"$workingPath/export/publicationDS_filtered").select("id") - val relDS3 = spark.read.load(s"$workingPath/export/relationDS").as[Relation] - val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2) - - val extRef = spark.read.load(s"$workingPath/export/externalReference").as[DLIExternalReference] - - spark.createDataset(relationTo.joinWith(extRef, relationTo("target").equalTo(extRef("id")), "inner").map(d => { - val r = d._1 - val ext = d._2 - (r.getSource, ext) - }).rdd.groupByKey.map(f => { - var dli_ext = ArrayBuffer[DLIExternalReference]() - f._2.foreach(d => if (dli_ext.size < 100) dli_ext += d ) - (f._1, dli_ext) - })).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference_grouped") - - val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS_filtered").as[Publication] - - val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/export/externalReference_grouped").as[(String, List[DLIExternalReference])] - - groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t => - { - val publication = t._2 - if (t._1 != null) { - val eRefs = t._1._2 - DLIToOAF.insertExternalRefs(publication, eRefs) - - } else - publication - } - ).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS") - - - dsDataset - .map(DLIToOAF.convertClinicalTrial) - .filter(p => p != null) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrials") - - val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/clinicalTrials").as[(String,String)] - - val relDS= spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation] - - relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner") - .map(k =>{ - val currentRel = k._1 - currentRel.setTarget(k._2._2) - currentRel - }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrialsRels") - - - val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/export/clinicalTrialsRels").as[Relation] - val rels:Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS_filtered").as[Relation] - - rels.union(clRels).flatMap(r => { - val inverseRel = new Relation - inverseRel.setSource(r.getTarget) - inverseRel.setTarget(r.getSource) - inverseRel.setDataInfo(r.getDataInfo) - inverseRel.setCollectedfrom(r.getCollectedfrom) - inverseRel.setRelType(r.getRelType) - inverseRel.setSubRelType(r.getSubRelType) - inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass)) - List(r, inverseRel) - }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationAS") - - - - spark.read.load(s"$workingPath/export/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS_fixed") - spark.read.load(s"$workingPath/export/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS_fixed") - - val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/relationAS").as[Relation].map(DLIToOAF.toActionSet) - val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet) - val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet) - - fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java deleted file mode 100644 index e19432f291..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java +++ /dev/null @@ -1,112 +0,0 @@ - -package eu.dnetlib.dhp.export.zenodo; - -import java.io.*; - -import org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.MakeTarArchive; - -public class MakeTar implements Serializable { - - private static final Logger log = LoggerFactory.getLogger(MakeTar.class); - - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - MakeTar.class - .getResourceAsStream( - "/eu/dnetlib/dhp/export/input_maketar_parameters.json")); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); - - final String outputPath = parser.get("targetPath"); - log.info("hdfsPath: {}", outputPath); - - final String hdfsNameNode = parser.get("nameNode"); - log.info("nameNode: {}", hdfsNameNode); - - final String inputPath = parser.get("sourcePath"); - log.info("input path : {}", inputPath); - - Configuration conf = new Configuration(); - conf.set("fs.defaultFS", hdfsNameNode); - - FileSystem fileSystem = FileSystem.get(conf); - - MakeTarArchive.tarMaxSize(fileSystem, inputPath, outputPath, "scholix_dump", 25); - - } - -// public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException { -// -// RemoteIterator dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath)); -// -// while (dir_iterator.hasNext()) { -// LocatedFileStatus fileStatus = dir_iterator.next(); -// -// Path p = fileStatus.getPath(); -// String p_string = p.toString(); -// String entity = p_string.substring(p_string.lastIndexOf("/") + 1); -// -// write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity); -// } -// -// } -// -// private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name) -// throws IOException { -// -// Path hdfsWritePath = new Path(outputPath); -// FSDataOutputStream fsDataOutputStream = null; -// if (fileSystem.exists(hdfsWritePath)) { -// fileSystem.delete(hdfsWritePath, true); -// -// } -// fsDataOutputStream = fileSystem.create(hdfsWritePath); -// -// TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream()); -// -// RemoteIterator fileStatusListIterator = fileSystem -// .listFiles( -// new Path(inputPath), true); -// -// while (fileStatusListIterator.hasNext()) { -// LocatedFileStatus fileStatus = fileStatusListIterator.next(); -// -// Path p = fileStatus.getPath(); -// String p_string = p.toString(); -// if (!p_string.endsWith("_SUCCESS")) { -// String name = p_string.substring(p_string.lastIndexOf("/") + 1); -// TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz"); -// entry.setSize(fileStatus.getLen()); -// ar.putArchiveEntry(entry); -// -// InputStream is = fileSystem.open(fileStatus.getPath()); -// -// BufferedInputStream bis = new BufferedInputStream(is); -// -// int count; -// byte data[] = new byte[1024]; -// while ((count = bis.read(data, 0, data.length)) != -1) { -// ar.write(data, 0, count); -// } -// bis.close(); -// ar.closeArchiveEntry(); -// -// } -// -// } -// -// ar.close(); -// } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java deleted file mode 100644 index 2e2b7bc266..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java +++ /dev/null @@ -1,80 +0,0 @@ - -package eu.dnetlib.dhp.export.zenodo; - -import java.io.Serializable; -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.*; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.api.MissingConceptDoiException; -import eu.dnetlib.dhp.common.api.ZenodoAPIClient; - -public class SendToZenodoHDFS implements Serializable { - - private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class); - - public static void main(final String[] args) throws Exception, MissingConceptDoiException { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SendToZenodoHDFS.class - .getResourceAsStream( - "/eu/dnetlib/dhp/export/upload_zenodo.json"))); - - parser.parseArgument(args); - - final String hdfsPath = parser.get("hdfsPath"); - final String hdfsNameNode = parser.get("nameNode"); - final String access_token = parser.get("accessToken"); - final String connection_url = parser.get("connectionUrl"); - final String metadata = parser.get("metadata"); - final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition")); - final String concept_rec_id = Optional - .ofNullable(parser.get("conceptRecordId")) - .orElse(null); - - Configuration conf = new Configuration(); - conf.set("fs.defaultFS", hdfsNameNode); - - FileSystem fileSystem = FileSystem.get(conf); - - RemoteIterator fileStatusListIterator = fileSystem - .listFiles( - new Path(hdfsPath), true); - ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token); - if (newDeposition) { - zenodoApiClient.newDeposition(); - } else { - if (concept_rec_id == null) { - throw new MissingConceptDoiException("No concept record id has been provided"); - } - zenodoApiClient.newVersion(concept_rec_id); - } - - while (fileStatusListIterator.hasNext()) { - LocatedFileStatus fileStatus = fileStatusListIterator.next(); - - Path p = fileStatus.getPath(); - String p_string = p.toString(); - if (!p_string.endsWith("_SUCCESS")) { - // String tmp = p_string.substring(0, p_string.lastIndexOf("/")); - String name = p_string.substring(p_string.lastIndexOf("/") + 1); - log.info("Sending information for community: " + name); - FSDataInputStream inputStream = fileSystem.open(p); - zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen()); - - } - - } - - zenodoApiClient.sendMretadata(metadata); -// zenodoApiClient.publish(); - - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java deleted file mode 100644 index 7598fd9573..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java +++ /dev/null @@ -1,98 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import java.util.Map; - -import org.apache.commons.io.IOUtils; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpDelete; -import org.apache.http.client.methods.HttpPut; -import org.apache.http.entity.StringEntity; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -public class DropAndCreateESIndex { - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - DropAndCreateESIndex.class - .getResourceAsStream( - "/eu/dnetlib/dhp/provision/dropAndCreateIndex.json"))); - parser.parseArgument(args); - - final String index = parser.get("index"); - - final String cluster = parser.get("cluster"); - final String clusterJson = IOUtils - .toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/cluster.json")); - - final Map clusterMap = new ObjectMapper().readValue(clusterJson, Map.class); - - final String ip = clusterMap.get(cluster).split(",")[0]; - - System.out.println(ip); - - final String url = "http://%s:9200/%s_%s"; - - CloseableHttpClient client = HttpClients.createDefault(); - - HttpDelete delete = new HttpDelete(String.format(url, ip, index, "object")); - - CloseableHttpResponse response = client.execute(delete); - - System.out.println("deleting Index SUMMARY"); - System.out.println(response.getStatusLine()); - client.close(); - client = HttpClients.createDefault(); - - delete = new HttpDelete(String.format(url, ip, index, "scholix")); - - response = client.execute(delete); - - System.out.println("deleting Index SCHOLIX"); - System.out.println(response.getStatusLine()); - client.close(); - client = HttpClients.createDefault(); - - final String summaryConf = IOUtils - .toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/summary_index.json")); - - final String scholixConf = IOUtils - .toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/scholix_index.json")); - - HttpPut put = new HttpPut(String.format(url, ip, index, "object")); - - StringEntity entity = new StringEntity(summaryConf); - put.setEntity(entity); - put.setHeader("Accept", "application/json"); - put.setHeader("Content-type", "application/json"); - - System.out.println("creating First Index SUMMARY"); - response = client.execute(put); - - client.close(); - client = HttpClients.createDefault(); - - System.out.println(response.getStatusLine()); - - System.out.println("creating Index SCHOLIX"); - put = new HttpPut(String.format(url, ip, index, "scholix")); - - entity = new StringEntity(scholixConf); - put.setEntity(entity); - put.setHeader("Accept", "application/json"); - put.setHeader("Content-type", "application/json"); - - response = client.execute(put); - System.out.println(response.getStatusLine()); - client.close(); - - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java deleted file mode 100644 index 1b0cb4d055..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java +++ /dev/null @@ -1,48 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.provision.scholix.summary.Typology; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class ProvisionUtil { - - public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; - public static final String TARGETJSONPATH = "$.target"; - public static final String SOURCEJSONPATH = "$.source"; - - // public static RelatedItemInfo getItemType(final String item, final String idPath) { - // String targetId = DHPUtils.getJPathString(idPath, item); - // switch (StringUtils.substringBefore(targetId, "|")) { - // case "50": - // return new RelatedItemInfo(null,0,1,0); - // case "60": - // return new RelatedItemInfo(null,1,0,0); - // case "70": - // return new RelatedItemInfo(null,0,0,1); - // default: - // throw new RuntimeException("Unknonw target ID"); - // - // } - // - // } - - public static Boolean isNotDeleted(final String item) { - return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item)); - } - - public static Typology getItemTypeFromId(String id) { - - switch (StringUtils.substringBefore(id, "|")) { - case "50": - return Typology.publication; - case "60": - return Typology.dataset; - case "70": - return Typology.unknown; - default: - throw new RuntimeException("Unknonw ID type"); - } - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java deleted file mode 100644 index 28826612d7..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java +++ /dev/null @@ -1,59 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import java.io.Serializable; - -/** This class models the information of related items */ -public class RelatedItemInfo implements Serializable { - - private String source; - - private long relatedDataset = 0; - - private long relatedPublication = 0; - - private long relatedUnknown = 0; - - public RelatedItemInfo() { - } - - public RelatedItemInfo( - String source, long relatedDataset, long relatedPublication, long relatedUnknown) { - this.source = source; - this.relatedDataset = relatedDataset; - this.relatedPublication = relatedPublication; - this.relatedUnknown = relatedUnknown; - } - - public String getSource() { - return source; - } - - public void setSource(String source) { - this.source = source; - } - - public long getRelatedDataset() { - return relatedDataset; - } - - public void setRelatedDataset(long relatedDataset) { - this.relatedDataset = relatedDataset; - } - - public long getRelatedPublication() { - return relatedPublication; - } - - public void setRelatedPublication(long relatedPublication) { - this.relatedPublication = relatedPublication; - } - - public long getRelatedUnknown() { - return relatedUnknown; - } - - public void setRelatedUnknown(int relatedUnknown) { - this.relatedUnknown = relatedUnknown; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala deleted file mode 100644 index 8133666a67..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala +++ /dev/null @@ -1,38 +0,0 @@ -package eu.dnetlib.dhp.provision - -import com.fasterxml.jackson.databind.ObjectMapper -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.provision.scholix.Scholix -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary -import org.apache.commons.io.IOUtils -import org.apache.hadoop.io.compress.GzipCodec -import org.apache.spark.SparkConf -import org.apache.spark.sql.{Encoder, Encoders, SparkSession} - -object SparkConvertDatasetToJson { - - def main(args: Array[String]): Unit = { - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json"))) - parser.parseArgument(args) - val conf = new SparkConf - val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate - - implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] - - - val workingPath = parser.get("workingPath") - - - - spark.read.load(s"$workingPath/summary").as[ScholixSummary] - .map(s => new ObjectMapper().writeValueAsString(s))(Encoders.STRING) - .rdd.repartition(500).saveAsTextFile(s"$workingPath/summary_json", classOf[GzipCodec]) - - spark.read.load(s"$workingPath/scholix").as[Scholix] - .map(s => new ObjectMapper().writeValueAsString(s))(Encoders.STRING) - .rdd.repartition(2000).saveAsTextFile(s"$workingPath/scholix_json", classOf[GzipCodec]) - - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.scala deleted file mode 100644 index d6e36ac877..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.scala +++ /dev/null @@ -1,60 +0,0 @@ -package eu.dnetlib.dhp.provision - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.Relation -import org.apache.commons.io.IOUtils -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} -import org.apache.spark.sql.functions.{coalesce, col, count, lit} - - -/** - * SparkExtractRelationCount is a spark job that takes in input relation RDD and retrieve for each item in relation - * which are the number of - Related Dataset - Related Publication - Related Unknown - */ -object SparkExtractRelationCount { - - - def main(args: Array[String]): Unit = { - - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractRelationCount.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))) - parser.parseArgument(args) - val spark = SparkSession.builder.appName(SparkExtractRelationCount.getClass.getSimpleName).master(parser.get("master")).getOrCreate - - val workingDirPath = parser.get("workingDirPath") - - val relationPath = parser.get("relationPath") - - implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation] - - val relation = spark.read.load(relationPath).as[Relation].map(r =>r)(Encoders.bean(classOf[Relation])) - - val relatedPublication = relation - .where("target like '50%'") - .groupBy("source") - .agg(count("target").as("publication")) - .select(col("source"). alias("p_source"), col("publication")) - val relatedDataset = relation - .where("target like '60%'") - .groupBy("source") - .agg(count("target").as("dataset")) - .select(col("source"). alias("d_source"), col("dataset")) - val relatedUnknown = relation - .where("target like '70%'") - .groupBy("source") - .agg(count("target").as("unknown")) - .select(col("source"). alias("u_source"), col("unknown")) - val firstJoin = relatedPublication - .join(relatedDataset,col("p_source").equalTo(col("d_source")),"full") - .select( coalesce( col("p_source"), col("d_source")).alias("id"), - col("publication"), - col("dataset")) - .join(relatedUnknown, col("u_source").equalTo(col("id")),"full") - .select( coalesce(col("u_source"), col("id")).alias("source"), - coalesce(col("publication"),lit(0)).alias("relatedPublication"), - coalesce(col("dataset"),lit(0)).alias("relatedDataset"), - coalesce(col("unknown"),lit(0)).alias("relatedUnknown") - ) - firstJoin.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/relatedItemCount") - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholixIndex.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholixIndex.scala deleted file mode 100644 index d39e38bfcc..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholixIndex.scala +++ /dev/null @@ -1,94 +0,0 @@ -package eu.dnetlib.dhp.provision - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixResource} -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary -import eu.dnetlib.dhp.schema.oaf.Relation -import org.apache.commons.io.IOUtils -import org.apache.spark.SparkConf -import org.apache.spark.sql.expressions.Aggregator -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} - -object SparkGenerateScholixIndex { - - - - def getScholixAggregator(): Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix]{ - - override def zero: Scholix = new Scholix() - - override def reduce(b: Scholix, a: (String, Scholix)): Scholix = { - b.mergeFrom(a._2) - b - } - - override def merge(wx: Scholix, wy: Scholix): Scholix = { - wx.mergeFrom(wy) - wx - } - override def finish(reduction: Scholix): Scholix = reduction - - override def bufferEncoder: Encoder[Scholix] = - Encoders.kryo(classOf[Scholix]) - - override def outputEncoder: Encoder[Scholix] = - Encoders.kryo(classOf[Scholix]) - } - - - def main(args: Array[String]): Unit = { - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholixIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))) - parser.parseArgument(args) - val conf = new SparkConf - conf.set("spark.sql.shuffle.partitions", "4000") - val spark = SparkSession.builder.config(conf).appName(SparkGenerateScholixIndex.getClass.getSimpleName).master(parser.get("master")).getOrCreate - - val graphPath = parser.get("graphPath") - val workingDirPath = parser.get("workingDirPath") - - - implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - implicit val relEncoder:Encoder[Relation] = Encoders.kryo[Relation] - implicit val scholixEncoder:Encoder[Scholix] = Encoders.kryo[Scholix] - implicit val tupleScholix:Encoder[(String,Scholix)]=Encoders.tuple(Encoders.STRING, scholixEncoder) - - - val scholixSummary:Dataset[(String,ScholixSummary)] = spark.read.load(s"$workingDirPath/summary").as[ScholixSummary] - .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder)) - val sourceRelations:Dataset[(String,Relation)]= spark.read.load(s"$graphPath/relation").as[Relation] - .map(r => (r.getSource,r))(Encoders.tuple(Encoders.STRING, relEncoder)) - - scholixSummary.joinWith(sourceRelations, scholixSummary("_1").equalTo(sourceRelations("_1")), "inner") - .map(r=> { - val summary = r._1._2 - val relation = r._2._2 - - (relation.getTarget, Scholix.generateScholixWithSource(summary,relation)) - - }).repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source") - - val sTarget:Dataset[(String,Scholix)] = spark.read.load(s"$workingDirPath/scholix_source").as[(String, Scholix)] - - sTarget.joinWith(scholixSummary, sTarget("_1").equalTo(scholixSummary("_1")), "inner").map(i => { - val summary = i._2._2 - val scholix = i._1._2 - - val scholixResource = ScholixResource.fromSummary(summary) - scholix.setTarget(scholixResource) - scholix.generateIdentifier() - scholix.generatelinkPublisher() - scholix - }).repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_r") - - - val finalScholix:Dataset[Scholix] = spark.read.load(s"$workingDirPath/scholix_r").as[Scholix] - - finalScholix.map(d => (d.getIdentifier, d))(Encoders.tuple(Encoders.STRING, scholixEncoder)) - .groupByKey(_._1)(Encoders.STRING) - .agg(getScholixAggregator().toColumn) - .map(p => p._2) - .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix") - - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummaryIndex.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummaryIndex.scala deleted file mode 100644 index bf3d0342b6..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummaryIndex.scala +++ /dev/null @@ -1,70 +0,0 @@ -package eu.dnetlib.dhp.provision - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary -import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation} -import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} -import org.apache.commons.io.IOUtils -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} - -object SparkGenerateSummaryIndex { - - def main(args: Array[String]): Unit = { - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateSummaryIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))) - parser.parseArgument(args) - val spark = SparkSession.builder.appName(SparkGenerateSummaryIndex.getClass.getSimpleName).master(parser.get("master")).getOrCreate - - val graphPath = parser.get("graphPath") - val workingDirPath = parser.get("workingDirPath") - - implicit val relatedItemInfoEncoders: Encoder[RelatedItemInfo] = Encoders.bean(classOf[RelatedItemInfo]) - implicit val datasetEncoder:Encoder[DLIDataset] = Encoders.kryo[DLIDataset] - implicit val publicationEncoder:Encoder[DLIPublication] = Encoders.kryo[DLIPublication] - implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation] - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val oafWithIdEncoder: Encoder[(String, Oaf)] = Encoders.tuple(Encoders.STRING, oafEncoder) - implicit val scholixSummaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - implicit val scholixSummaryEncoderTuple: Encoder[(String,ScholixSummary)] = Encoders.tuple(Encoders.STRING,scholixSummaryEncoder) - - - val pubs = spark.read.load(s"$graphPath/publication").as[Oaf].map(o => (o.asInstanceOf[DLIPublication].getId, o)) - val dats = spark.read.load(s"$graphPath/dataset").as[Oaf].map(o => (o.asInstanceOf[DLIDataset].getId, o)) - val ukn = spark.read.load(s"$graphPath/unknown").as[Oaf].map(o => (o.asInstanceOf[DLIUnknown].getId, o)) - - - val summary:Dataset[(String,ScholixSummary)] = pubs.union(dats).union(ukn).map(o =>{ - val s = ScholixSummary.fromOAF(o._2) - (s.getId,s) - }) - - - val relatedItemInfoDs:Dataset[RelatedItemInfo] = spark.read.load(s"$workingDirPath/relatedItemCount").as[RelatedItemInfo] - - - summary.joinWith(relatedItemInfoDs, summary("_1").equalTo(relatedItemInfoDs("source")), "inner") - .map(i => { - val summary = i._1._2 - val relatedItemInfo = i._2 - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset) - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication) - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown) - summary - }).filter(s => s.getLocalIdentifier != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/summary") - - - - - - - - - - - - - - - - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java deleted file mode 100644 index f1eb3992db..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java +++ /dev/null @@ -1,61 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; -import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -public class -SparkIndexCollectionOnES { - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkIndexCollectionOnES.class - .getResourceAsStream( - "/eu/dnetlib/dhp/provision/index_on_es.json"))); - parser.parseArgument(args); - - SparkConf conf = new SparkConf() - .setAppName(SparkIndexCollectionOnES.class.getSimpleName()) - .setMaster(parser.get("master")); - - conf.set("spark.sql.shuffle.partitions", "4000"); - - final String sourcePath = parser.get("sourcePath"); - final String index = parser.get("index"); - final String idPath = parser.get("idPath"); - final String cluster = parser.get("cluster"); - final String clusterJson = IOUtils - .toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/cluster.json")); - - final Map clusterMap = new ObjectMapper().readValue(clusterJson, Map.class); - - final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD inputRdd = sc.textFile(sourcePath); - - Map esCfg = new HashMap<>(); - esCfg.put("es.nodes", clusterMap.get(cluster)); - esCfg.put("es.mapping.id", idPath); - esCfg.put("es.batch.write.retry.count", "8"); - esCfg.put("es.batch.write.retry.wait", "60s"); - esCfg.put("es.batch.size.entries", "200"); - esCfg.put("es.nodes.wan.only", "true"); - JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java deleted file mode 100644 index ec3da5cfc9..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java +++ /dev/null @@ -1,286 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix; - -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class Scholix implements Serializable { - private String publicationDate; - - private List publisher; - - private List linkprovider; - - private ScholixRelationship relationship; - - private ScholixResource source; - - private ScholixResource target; - - private String identifier; - - public Scholix clone(final ScholixResource t) { - final Scholix clone = new Scholix(); - clone.setPublicationDate(publicationDate); - clone.setPublisher(publisher); - clone.setLinkprovider(linkprovider); - clone.setRelationship(relationship); - clone.setSource(source); - clone.setTarget(t); - clone.generatelinkPublisher(); - clone.generateIdentifier(); - return clone; - } - - public static Scholix generateScholixWithSource( - final String sourceSummaryJson, final String relation) { - final ObjectMapper mapper = new ObjectMapper(); - - try { - ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class); - Relation rel = mapper.readValue(relation, Relation.class); - final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) - s.setPublicationDate(scholixSummary.getDate().get(0)); - s - .setLinkprovider( - rel - .getCollectedfrom() - .stream() - .map( - cf -> new ScholixEntityId( - cf.getValue(), - Collections - .singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) - .collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); - s.setSource(ScholixResource.fromSummary(scholixSummary)); - return s; - } catch (Throwable e) { - throw new RuntimeException( - String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e); - } - } - - public static Scholix generateScholixWithSource( - final ScholixSummary scholixSummary, final Relation rel) { - final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) - s.setPublicationDate(scholixSummary.getDate().get(0)); - s - .setLinkprovider( - rel - .getCollectedfrom() - .stream() - .map( - cf -> new ScholixEntityId( - cf.getValue(), - Collections - .singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) - .collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); - s.setSource(ScholixResource.fromSummary(scholixSummary)); - - s.setIdentifier(rel.getTarget()); - return s; - } - - private List mergeScholixEntityId(final List a, final List b) { - final List m = a != null ? new ArrayList<>(a) : new ArrayList<>(); - if (b != null) - b.forEach(s -> { - if (s != null) { - int tt = (int) m - .stream() - .filter(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName())) - .count(); - if (tt == 0) { - m.add(s); - } - } - }); - return m; - } - - private List mergeScholixIdnetifier(final List a, - final List b) { - final List m = a != null ? new ArrayList<>(a) : new ArrayList<>(); - if (b != null) - b.forEach(s -> { - int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count(); - if (tt == 0) { - m.add(s); - } - }); - return m; - } - - private List mergeScholixCollectedFrom(final List a, - final List b) { - final List m = a != null ? new ArrayList<>(a) : new ArrayList<>(); - if (b != null) - b.forEach(s -> { - int tt = (int) m - .stream() - .filter(t -> t.getProvider().getName().equalsIgnoreCase(s.getProvider().getName())) - .count(); - if (tt == 0) { - m.add(s); - } - }); - return m; - } - - private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) { - ScholixRelationship result = new ScholixRelationship(); - result.setName(a == null || StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName()); - result.setInverse(a == null || StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse()); - result.setSchema(a == null || StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema()); - return result; - } - - private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) { - if (a == null) - return b; - final ScholixResource result = new ScholixResource(); - result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom())); - result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator())); - result - .setDnetIdentifier( - StringUtils.isBlank(a.getDnetIdentifier()) ? b.getDnetIdentifier() : a.getDnetIdentifier()); - result.setIdentifier(mergeScholixIdnetifier(a.getIdentifier(), b.getIdentifier())); - result.setObjectType(StringUtils.isNotBlank(a.getObjectType()) ? a.getObjectType() : b.getObjectType()); - result - .setObjectSubType( - StringUtils.isNotBlank(a.getObjectSubType()) ? a.getObjectSubType() : b.getObjectSubType()); - result.setPublisher(mergeScholixEntityId(a.getPublisher(), b.getPublisher())); - result - .setPublicationDate( - StringUtils.isNotBlank(a.getPublicationDate()) ? a.getPublicationDate() : b.getPublicationDate()); - result.setTitle(StringUtils.isNotBlank(a.getTitle()) ? a.getTitle() : b.getTitle()); - return result; - - } - - public void mergeFrom(final Scholix other) { - linkprovider = mergeScholixEntityId(linkprovider, other.getLinkprovider()); - publisher = mergeScholixEntityId(publisher, other.getPublisher()); - if (StringUtils.isEmpty(publicationDate)) - publicationDate = other.getPublicationDate(); - relationship = mergeRelationships(relationship, other.getRelationship()); - source = mergeResource(source, other.getSource()); - target = mergeResource(target, other.getTarget()); - generateIdentifier(); - } - - public void generatelinkPublisher() { - Set publisher = new HashSet<>(); - if (source.getPublisher() != null) - publisher - .addAll( - source - .getPublisher() - .stream() - .map(ScholixEntityId::getName) - .collect(Collectors.toList())); - if (target.getPublisher() != null) - publisher - .addAll( - target - .getPublisher() - .stream() - .map(ScholixEntityId::getName) - .collect(Collectors.toList())); - this.publisher = publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList()); - } - - public void generateIdentifier() { - setIdentifier( - DHPUtils - .md5( - String - .format( - "%s::%s::%s", - source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier()))); - } - - public Scholix addTarget(final String targetSummaryJson) { - final ObjectMapper mapper = new ObjectMapper(); - - try { - ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class); - setTarget(ScholixResource.fromSummary(targetSummary)); - generateIdentifier(); - return this; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public String getPublicationDate() { - return publicationDate; - } - - public void setPublicationDate(String publicationDate) { - this.publicationDate = publicationDate; - } - - public List getPublisher() { - return publisher; - } - - public void setPublisher(List publisher) { - this.publisher = publisher; - } - - public List getLinkprovider() { - return linkprovider; - } - - public void setLinkprovider(List linkprovider) { - this.linkprovider = linkprovider; - } - - public ScholixRelationship getRelationship() { - return relationship; - } - - public void setRelationship(ScholixRelationship relationship) { - this.relationship = relationship; - } - - public ScholixResource getSource() { - return source; - } - - public void setSource(ScholixResource source) { - this.source = source; - } - - public ScholixResource getTarget() { - return target; - } - - public void setTarget(ScholixResource target) { - this.target = target; - } - - public String getIdentifier() { - return identifier; - } - - public void setIdentifier(String identifier) { - this.identifier = identifier; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java deleted file mode 100644 index 9ce071fbc2..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java +++ /dev/null @@ -1,45 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix; - -import java.io.Serializable; - -public class ScholixCollectedFrom implements Serializable { - - private ScholixEntityId provider; - private String provisionMode; - private String completionStatus; - - public ScholixCollectedFrom() { - } - - public ScholixCollectedFrom( - ScholixEntityId provider, String provisionMode, String completionStatus) { - this.provider = provider; - this.provisionMode = provisionMode; - this.completionStatus = completionStatus; - } - - public ScholixEntityId getProvider() { - return provider; - } - - public void setProvider(ScholixEntityId provider) { - this.provider = provider; - } - - public String getProvisionMode() { - return provisionMode; - } - - public void setProvisionMode(String provisionMode) { - this.provisionMode = provisionMode; - } - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java deleted file mode 100644 index e797017bc7..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix; - -import java.io.Serializable; -import java.util.List; - -public class ScholixEntityId implements Serializable { - private String name; - private List identifiers; - - public ScholixEntityId() { - } - - public ScholixEntityId(String name, List identifiers) { - this.name = name; - this.identifiers = identifiers; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public List getIdentifiers() { - return identifiers; - } - - public void setIdentifiers(List identifiers) { - this.identifiers = identifiers; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java deleted file mode 100644 index 0dd15336a4..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix; - -import java.io.Serializable; - -public class ScholixIdentifier implements Serializable { - private String identifier; - private String schema; - - public ScholixIdentifier() { - } - - public ScholixIdentifier(String identifier, String schema) { - this.identifier = identifier; - this.schema = schema; - } - - public String getIdentifier() { - return identifier; - } - - public void setIdentifier(String identifier) { - this.identifier = identifier; - } - - public String getSchema() { - return schema; - } - - public void setSchema(String schema) { - this.schema = schema; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java deleted file mode 100644 index 0cbdf43e79..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java +++ /dev/null @@ -1,43 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix; - -import java.io.Serializable; - -public class ScholixRelationship implements Serializable { - private String name; - private String schema; - private String inverse; - - public ScholixRelationship() { - } - - public ScholixRelationship(String name, String schema, String inverse) { - this.name = name; - this.schema = schema; - this.inverse = inverse; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getSchema() { - return schema; - } - - public void setSchema(String schema) { - this.schema = schema; - } - - public String getInverse() { - return inverse; - } - - public void setInverse(String inverse) { - this.inverse = inverse; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java deleted file mode 100644 index 6de30c7481..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java +++ /dev/null @@ -1,151 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix; - -import java.io.Serializable; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; - -public class ScholixResource implements Serializable { - - private List identifier; - private String dnetIdentifier; - private String objectType; - private String objectSubType; - private String title; - private List creator; - private String publicationDate; - private List publisher; - private List collectedFrom; - - public static ScholixResource fromSummary(ScholixSummary summary) { - - final ScholixResource resource = new ScholixResource(); - - resource.setDnetIdentifier(summary.getId()); - - resource - .setIdentifier( - summary - .getLocalIdentifier() - .stream() - .map(i -> new ScholixIdentifier(i.getId(), i.getType())) - .collect(Collectors.toList())); - - resource.setObjectType(summary.getTypology().toString()); - - if (summary.getTitle() != null && summary.getTitle().size() > 0) - resource.setTitle(summary.getTitle().get(0)); - - if (summary.getAuthor() != null) - resource - .setCreator( - summary - .getAuthor() - .stream() - .map(c -> new ScholixEntityId(c, null)) - .collect(Collectors.toList())); - - if (summary.getDate() != null && summary.getDate().size() > 0) - resource.setPublicationDate(summary.getDate().get(0)); - if (summary.getPublisher() != null) - resource - .setPublisher( - summary - .getPublisher() - .stream() - .map(p -> new ScholixEntityId(p, null)) - .collect(Collectors.toList())); - if (summary.getDatasources() != null) - resource - .setCollectedFrom( - summary - .getDatasources() - .stream() - .map( - d -> new ScholixCollectedFrom( - new ScholixEntityId( - d.getDatasourceName(), - Collections - .singletonList( - new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))), - "collected", - d.getCompletionStatus())) - .collect(Collectors.toList())); - return resource; - } - - public List getIdentifier() { - return identifier; - } - - public void setIdentifier(List identifier) { - this.identifier = identifier; - } - - public String getDnetIdentifier() { - return dnetIdentifier; - } - - public void setDnetIdentifier(String dnetIdentifier) { - this.dnetIdentifier = dnetIdentifier; - } - - public String getObjectType() { - return objectType; - } - - public void setObjectType(String objectType) { - this.objectType = objectType; - } - - public String getObjectSubType() { - return objectSubType; - } - - public void setObjectSubType(String objectSubType) { - this.objectSubType = objectSubType; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public List getCreator() { - return creator; - } - - public void setCreator(List creator) { - this.creator = creator; - } - - public String getPublicationDate() { - return publicationDate; - } - - public void setPublicationDate(String publicationDate) { - this.publicationDate = publicationDate; - } - - public List getPublisher() { - return publisher; - } - - public void setPublisher(List publisher) { - this.publisher = publisher; - } - - public List getCollectedFrom() { - return collectedFrom; - } - - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java deleted file mode 100644 index 6d6f46f544..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java +++ /dev/null @@ -1,44 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix.summary; - -import java.io.Serializable; - -public class CollectedFromType implements Serializable { - - private String datasourceName; - private String datasourceId; - private String completionStatus; - - public CollectedFromType() { - } - - public CollectedFromType(String datasourceName, String datasourceId, String completionStatus) { - this.datasourceName = datasourceName; - this.datasourceId = datasourceId; - this.completionStatus = completionStatus; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java deleted file mode 100644 index e9d94fccf9..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix.summary; - -import java.io.Serializable; - -public class SchemeValue implements Serializable { - private String scheme; - private String value; - - public SchemeValue() { - } - - public SchemeValue(String scheme, String value) { - this.scheme = scheme; - this.value = value; - } - - public String getScheme() { - return scheme; - } - - public void setScheme(String scheme) { - this.scheme = scheme; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java deleted file mode 100644 index 3b808ba519..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java +++ /dev/null @@ -1,321 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix.summary; - -import java.io.Serializable; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import com.fasterxml.jackson.annotation.JsonProperty; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.provision.RelatedItemInfo; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; -import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; - -public class ScholixSummary implements Serializable { - private String id; - private List localIdentifier; - private Typology typology; - private List title; - private List author; - private List date; - private String description; - private List subject; - private List publisher; - private long relatedPublications; - private long relatedDatasets; - private long relatedUnknown; - private List datasources; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getLocalIdentifier() { - return localIdentifier; - } - - public void setLocalIdentifier(List localIdentifier) { - this.localIdentifier = localIdentifier; - } - - public Typology getTypology() { - return typology; - } - - public void setTypology(Typology typology) { - this.typology = typology; - } - - public List getTitle() { - return title; - } - - public void setTitle(List title) { - this.title = title; - } - - public List getAuthor() { - return author; - } - - public void setAuthor(List author) { - this.author = author; - } - - public List getDate() { - return date; - } - - public void setDate(List date) { - this.date = date; - } - - @JsonProperty("abstract") - public String getDescription() { - return description; - } - - @JsonProperty("abstract") - public void setDescription(String description) { - this.description = description; - } - - public List getSubject() { - return subject; - } - - public void setSubject(List subject) { - this.subject = subject; - } - - public List getPublisher() { - return publisher; - } - - public void setPublisher(List publisher) { - this.publisher = publisher; - } - - public long getRelatedPublications() { - return relatedPublications; - } - - public void setRelatedPublications(long relatedPublications) { - this.relatedPublications = relatedPublications; - } - - public long getRelatedDatasets() { - return relatedDatasets; - } - - public void setRelatedDatasets(long relatedDatasets) { - this.relatedDatasets = relatedDatasets; - } - - public long getRelatedUnknown() { - return relatedUnknown; - } - - public void setRelatedUnknown(long relatedUnknown) { - this.relatedUnknown = relatedUnknown; - } - - public List getDatasources() { - return datasources; - } - - public void setDatasources(List datasources) { - this.datasources = datasources; - } - - public static ScholixSummary fromOAF(final Oaf oaf) { - try { - final RelatedItemInfo relatedItemInfo = new RelatedItemInfo(); - - if (oaf instanceof DLIPublication) - return summaryFromPublication((DLIPublication) oaf, relatedItemInfo); - if (oaf instanceof DLIDataset) - return summaryFromDataset((DLIDataset) oaf, relatedItemInfo); - if (oaf instanceof DLIUnknown) - return summaryFromUnknown((DLIUnknown) oaf, relatedItemInfo); - - } catch (Throwable e) { - throw new RuntimeException(e); - } - return null; - } - - private static ScholixSummary summaryFromDataset( - final DLIDataset item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); - - if (item.getPid() != null) - summary - .setLocalIdentifier( - item - .getPid() - .stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); - - summary.setTypology(Typology.dataset); - if (item.getTitle() != null) - summary - .setTitle( - item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); - - if (item.getAuthor() != null) { - summary - .setAuthor( - item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); - } - - if (item.getRelevantdate() != null) - summary - .setDate( - item - .getRelevantdate() - .stream() - .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) - .map(StructuredProperty::getValue) - .collect(Collectors.toList())); - - if (item.getDescription() != null && item.getDescription().size() > 0) - summary.setDescription(item.getDescription().get(0).getValue()); - - if (item.getSubject() != null) { - summary - .setSubject( - item - .getSubject() - .stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList())); - } - if (item.getPublisher() != null) - summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - - if (item.getDlicollectedfrom() != null) - summary - .setDatasources( - item - .getDlicollectedfrom() - .stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - return summary; - } - - private static ScholixSummary summaryFromPublication( - final DLIPublication item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); - - if (item.getPid() != null) - summary - .setLocalIdentifier( - item - .getPid() - .stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); - - summary.setTypology(Typology.publication); - if (item.getTitle() != null) - summary - .setTitle( - item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); - - if (item.getAuthor() != null) { - summary - .setAuthor( - item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); - } - - if (item.getRelevantdate() != null) - summary - .setDate( - item - .getRelevantdate() - .stream() - .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) - .map(StructuredProperty::getValue) - .collect(Collectors.toList())); - - if (item.getDescription() != null && item.getDescription().size() > 0) - summary.setDescription(item.getDescription().get(0).getValue()); - - if (item.getSubject() != null) { - summary - .setSubject( - item - .getSubject() - .stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList())); - } - - if (item.getPublisher() != null) - summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - - if (item.getDlicollectedfrom() != null) - summary - .setDatasources( - item - .getDlicollectedfrom() - .stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - - return summary; - } - - private static ScholixSummary summaryFromUnknown( - final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); - if (item.getPid() != null) - summary - .setLocalIdentifier( - item - .getPid() - .stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); - - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - summary.setTypology(Typology.unknown); - if (item.getDlicollectedfrom() != null) - summary - .setDatasources( - item - .getDlicollectedfrom() - .stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - return summary; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java deleted file mode 100644 index c4148ad242..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix.summary; - -import java.io.Serializable; - -public class TypedIdentifier implements Serializable { - private String id; - private String type; - - public TypedIdentifier() { - } - - public TypedIdentifier(String id, String type) { - this.id = id; - this.type = type; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java deleted file mode 100644 index effa32b6bc..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java +++ /dev/null @@ -1,8 +0,0 @@ - -package eu.dnetlib.dhp.provision.scholix.summary; - -import java.io.Serializable; - -public enum Typology implements Serializable { - dataset, publication, unknown -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java deleted file mode 100644 index a172ef698c..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java +++ /dev/null @@ -1,131 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; - -import eu.dnetlib.dhp.provision.scholix.ScholixCollectedFrom; -import eu.dnetlib.dhp.provision.scholix.ScholixEntityId; -import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class CrossRefParserJSON { - - private static final List collectedFrom = generateCrossrefCollectedFrom("complete"); - - public static ScholixResource parseRecord(final String record) { - if (record == null) - return null; - JsonElement jElement = new JsonParser().parse(record); - JsonElement source = null; - if (jElement.getAsJsonObject().has("_source")) { - source = jElement.getAsJsonObject().get("_source"); - if (source == null || !source.isJsonObject()) - return null; - } else if (jElement.getAsJsonObject().has("DOI")) { - source = jElement; - } else { - return null; - } - - final JsonObject message = source.getAsJsonObject(); - ScholixResource currentObject = new ScholixResource(); - - if (message.get("DOI") != null) { - final String doi = message.get("DOI").getAsString(); - currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); - } - - if ((!message.get("created").isJsonNull()) - && (message.getAsJsonObject("created").get("date-time") != null)) { - currentObject - .setPublicationDate( - message.getAsJsonObject("created").get("date-time").getAsString()); - } - - if (message.get("title") != null - && !message.get("title").isJsonNull() - && message.get("title").isJsonArray()) { - - JsonArray array = message.get("title").getAsJsonArray(); - currentObject.setTitle(array.get(0).getAsString()); - } - if (message.get("author") != null && !message.get("author").isJsonNull()) { - JsonArray author = message.getAsJsonArray("author"); - List authorList = new ArrayList<>(); - for (JsonElement anAuthor : author) { - JsonObject currentAuth = anAuthor.getAsJsonObject(); - - String family = ""; - String given = ""; - if (currentAuth != null - && currentAuth.get("family") != null - && !currentAuth.get("family").isJsonNull()) { - family = currentAuth.get("family").getAsString(); - } - if (currentAuth != null - && currentAuth.get("given") != null - && !currentAuth.get("given").isJsonNull()) { - given = currentAuth.get("given").getAsString(); - } - authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null)); - } - currentObject.setCreator(authorList); - } - if (message.get("publisher") != null && !message.get("publisher").isJsonNull()) { - currentObject - .setPublisher( - Collections - .singletonList( - new ScholixEntityId(message.get("publisher").getAsString(), null))); - } - currentObject.setCollectedFrom(collectedFrom); - currentObject.setObjectType("publication"); - currentObject - .setDnetIdentifier( - generateId(message.get("DOI").getAsString(), "doi", "publication")); - - return currentObject; - } - - private static List generateCrossrefCollectedFrom( - final String completionStatus) { - final ScholixEntityId scholixEntityId = new ScholixEntityId( - "Crossref", - Collections - .singletonList( - new ScholixIdentifier("dli_________::crossref", "dnet_identifier"))); - return Collections - .singletonList( - new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus)); - } - - private static String generateId( - final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - return type - + DHPUtils - .md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java deleted file mode 100644 index 9ace7b37a9..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java +++ /dev/null @@ -1,90 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.io.ByteArrayOutputStream; -import java.util.zip.Inflater; - -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.IOUtils; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; - -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; - -import eu.dnetlib.dhp.provision.scholix.ScholixResource; - -public class CrossrefClient { - - private String host; - private String index = "crossref"; - private String indexType = "item"; - - public CrossrefClient(String host) { - this.host = host; - } - - public String getHost() { - return host; - } - - public void setHost(String host) { - this.host = host; - } - - public String getIndex() { - return index; - } - - public void setIndex(String index) { - this.index = index; - } - - public String getIndexType() { - return indexType; - } - - public void setIndexType(String indexType) { - this.indexType = indexType; - } - - private static String decompressBlob(final String blob) { - try { - byte[] byteArray = Base64.decodeBase64(blob.getBytes()); - final Inflater decompresser = new Inflater(); - decompresser.setInput(byteArray); - final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); - byte[] buffer = new byte[8192]; - while (!decompresser.finished()) { - int size = decompresser.inflate(buffer); - bos.write(buffer, 0, size); - } - decompresser.end(); - return bos.toString(); - } catch (Throwable e) { - throw new RuntimeException("Wrong record:" + blob, e); - } - } - - public ScholixResource getResourceByDOI(final String doi) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = new HttpGet( - String - .format( - "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); - CloseableHttpResponse response = client.execute(httpGet); - String json = IOUtils.toString(response.getEntity().getContent()); - if (json.contains("blob")) { - JsonParser p = new JsonParser(); - final JsonElement root = p.parse(json); - json = decompressBlob( - root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString()); - } - return CrossRefParserJSON.parseRecord(json); - } catch (Throwable e) { - return null; - } - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java deleted file mode 100644 index 10426b29c8..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java +++ /dev/null @@ -1,229 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.jayway.jsonpath.JsonPath; - -import eu.dnetlib.dhp.provision.scholix.*; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.scholexplorer.relation.RelInfo; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public class Datacite2Scholix { - - private String rootPath = "$.attributes"; - final RelationMapper relationMapper; - - public Datacite2Scholix(RelationMapper relationMapper) { - this.relationMapper = relationMapper; - } - - public List generateScholixFromJson(final String dJson) { - List> relIds = getRelatedIendtifiers(dJson); - relIds = relIds != null - ? relIds - .stream() - .filter( - m -> m.containsKey("relatedIdentifierType") - && m.containsKey("relationType") - && m.containsKey("relatedIdentifier")) - .collect(Collectors.toList()) - : null; - if (relIds == null || relIds.size() == 0) - return null; - - final String updated = JsonPath.read(dJson, rootPath + ".updated"); - ScholixResource resource = generateDataciteScholixResource(dJson); - - return relIds - .stream() - .flatMap( - s -> { - try { - final List result = generateScholix( - resource, - "" + s.get("relatedIdentifier"), - s.get("relatedIdentifierType"), - s.get("relationType"), - updated); - return result.stream(); - } catch (Throwable e) { - return new ArrayList().stream(); - } - }) - .collect(Collectors.toList()); - } - - public String getRootPath() { - return rootPath; - } - - public void setRootPath(String rootPath) { - this.rootPath = rootPath; - } - - private List generateScholix( - ScholixResource source, - final String pid, - final String pidtype, - final String relType, - final String updated) { - - if ("doi".equalsIgnoreCase(pidtype)) { - ScholixResource target = new ScholixResource(); - target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); - final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", - relInfo.getInverse()); - final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); - final Scholix s = new Scholix(); - s.setSource(source); - s.setTarget(target); - s.setLinkprovider(Collections.singletonList(provider)); - s.setPublisher(source.getPublisher()); - s.setRelationship(rel); - s.setPublicationDate(updated); - return Collections.singletonList(s); - } else { - final List result = new ArrayList<>(); - ScholixResource target = new ScholixResource(); - target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); - target.setDnetIdentifier(generateId(pid, pidtype, "unknown")); - target.setObjectType("unknown"); - target.setCollectedFrom(generateDataciteCollectedFrom("incomplete")); - final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", - relInfo.getInverse()); - final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); - final Scholix s = new Scholix(); - s.setSource(source); - s.setTarget(target); - s.setLinkprovider(Collections.singletonList(provider)); - s.setPublisher(source.getPublisher()); - s.setRelationship(rel); - s.setPublicationDate(updated); - s.generateIdentifier(); - result.add(s); - final Scholix s2 = new Scholix(); - s2.setSource(target); - s2.setTarget(source); - s2.setLinkprovider(Collections.singletonList(provider)); - s2.setPublisher(source.getPublisher()); - s2 - .setRelationship( - new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal())); - s2.setPublicationDate(updated); - s2.generateIdentifier(); - result.add(s2); - return result; - } - } - - public ScholixResource generateDataciteScholixResource(String dJson) { - ScholixResource resource = new ScholixResource(); - String DOI_PATH = rootPath + ".doi"; - final String doi = JsonPath.read(dJson, DOI_PATH); - resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); - resource.setObjectType(getType(dJson)); - resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType())); - resource.setCollectedFrom(generateDataciteCollectedFrom("complete")); - final String publisher = JsonPath.read(dJson, rootPath + ".publisher"); - if (StringUtils.isNotBlank(publisher)) - resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null))); - final String date = getDate(dJson); - if (StringUtils.isNotBlank(date)) - resource.setPublicationDate(date); - final String title = getTitle(dJson); - if (StringUtils.isNotBlank(title)) - resource.setTitle(title); - resource.setCreator(getCreators(dJson)); - return resource; - } - - private List getCreators(final String json) { - final List creatorName = JsonPath.read(json, rootPath + ".creators[*].name"); - if (creatorName != null && creatorName.size() > 0) { - return creatorName - .stream() - .map(s -> new ScholixEntityId(s, null)) - .collect(Collectors.toList()); - } - return null; - } - - private String getTitle(final String json) { - final List titles = JsonPath.read(json, rootPath + ".titles[*].title"); - return titles != null && titles.size() > 0 ? titles.get(0) : null; - } - - private String getDate(final String json) { - final List> dates = JsonPath.read(json, rootPath + ".dates"); - if (dates != null && dates.size() > 0) { - - List> issued = dates - .stream() - .filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))) - .collect(Collectors.toList()); - if (issued.size() > 0) - return issued.get(0).get("date"); - } - return null; - } - - private List generateDataciteCollectedFrom(final String completionStatus) { - final ScholixEntityId scholixEntityId = new ScholixEntityId( - "Datasets in Datacite", - Collections - .singletonList( - new ScholixIdentifier("dli_________::datacite", "dnet_identifier"))); - return Collections - .singletonList( - new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus)); - } - - private String getType(final String json) { - try { - final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex"); - if ("article".equalsIgnoreCase(bibtext)) { - return "publication"; - } - return "dataset"; - } catch (Throwable e) { - return "dataset"; - } - } - - private List> getRelatedIendtifiers(final String json) { - String REL_IDENTIFIER_PATH = rootPath + ".relatedIdentifiers[*]"; - List> res = JsonPath.read(json, REL_IDENTIFIER_PATH); - return res; - } - - public static String generateId(final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - return type - + DHPUtils - .md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java deleted file mode 100644 index 9e9f0d5c99..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java +++ /dev/null @@ -1,75 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.io.IOException; - -import org.apache.commons.io.IOUtils; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; - -import eu.dnetlib.dhp.provision.scholix.ScholixResource; - -public class DataciteClient { - - private String host; - private String index = "datacite"; - private String indexType = "dump"; - private final Datacite2Scholix d2s; - - public DataciteClient(String host) { - this.host = host; - - d2s = new Datacite2Scholix(null); - d2s.setRootPath("$._source.attributes"); - } - - public Iterable getDatasetsFromTs(final Long timestamp) { - return () -> { - try { - return new DataciteClientIterator(host, index, timestamp); - } catch (IOException e) { - throw new RuntimeException(e); - } - }; - } - - public String getHost() { - return host; - } - - public void setHost(String host) { - this.host = host; - } - - public String getIndex() { - return index; - } - - public void setIndex(String index) { - this.index = index; - } - - public String getIndexType() { - return indexType; - } - - public void setIndexType(String indexType) { - this.indexType = indexType; - } - - public ScholixResource getDatasetByDOI(final String doi) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = new HttpGet( - String - .format( - "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); - CloseableHttpResponse response = client.execute(httpGet); - final String json = IOUtils.toString(response.getEntity().getContent()); - return d2s.generateDataciteScholixResource(json); - } catch (Throwable e) { - return null; - } - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java deleted file mode 100644 index 2c70c8b091..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java +++ /dev/null @@ -1,120 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpPost; -import org.apache.http.entity.StringEntity; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; - -import net.minidev.json.JSONArray; - -public class DataciteClientIterator implements Iterator { - - static final String blobPath = "$.hits.hits[*]._source"; - static final String scrollIdPath = "$._scroll_id"; - - String scrollId; - - List buffer; - - final String esHost; - final String esIndex; - final ObjectMapper mapper = new ObjectMapper(); - - public DataciteClientIterator(final String esHost, final String esIndex, long timestamp) - throws IOException { - - this.esHost = esHost; - this.esIndex = esIndex; - // THIS FIX IS NECESSARY to avoid different timezone - timestamp -= (60 * 60 * 2); - final String body = getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), - String - .format( - "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp)); - scrollId = getJPathString(scrollIdPath, body); - buffer = getBlobs(body); - } - - public String getResponse(final String url, final String json) { - CloseableHttpClient client = HttpClients.createDefault(); - try { - - HttpPost httpPost = new HttpPost(url); - if (json != null) { - StringEntity entity = new StringEntity(json); - httpPost.setEntity(entity); - httpPost.setHeader("Accept", "application/json"); - httpPost.setHeader("Content-type", "application/json"); - } - CloseableHttpResponse response = client.execute(httpPost); - - return IOUtils.toString(response.getEntity().getContent()); - } catch (Throwable e) { - throw new RuntimeException("Error on executing request ", e); - } finally { - try { - client.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close client ", e); - } - } - } - - private String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String) o; - return null; - } catch (Exception e) { - return ""; - } - } - - private List getBlobs(final String body) { - JSONArray array = JsonPath.read(body, blobPath); - return array - .stream() - .map( - o -> { - try { - return mapper.writeValueAsString(o); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } - - @Override - public boolean hasNext() { - return (buffer != null && !buffer.isEmpty()); - } - - @Override - public String next() { - final String nextItem = buffer.remove(0); - if (buffer.isEmpty()) { - final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); - final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); - try { - buffer = getBlobs(body); - } catch (Throwable e) { - System.out.println(body); - } - } - return nextItem; - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java deleted file mode 100644 index e876d05a12..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java +++ /dev/null @@ -1,72 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.net.URI; -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public class RetrieveUpdateFromDatacite { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - RetrieveUpdateFromDatacite.class - .getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); - parser.parseArgument(args); - final String hdfsuri = parser.get("namenode"); - Path hdfswritepath = new Path(parser.get("targetPath")); - final long timestamp = Long.parseLong(parser.get("timestamp")); - final String host = parser.get("indexHost"); - final String index = parser.get("indexName"); - - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - FileSystem.get(URI.create(hdfsuri), conf); - final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load()); - final ObjectMapper mapper = new ObjectMapper(); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final Text value = new Text(); - final IntWritable key = new IntWritable(); - int i = 0; - for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) { - i++; - List scholix = d2s.generateScholixFromJson(dataset); - if (scholix != null) - for (Scholix s : scholix) { - key.set(i); - value.set(mapper.writeValueAsString(s)); - writer.append(key, value); - if (i % 10000 == 0) { - System.out.println("wrote " + i); - } - } - } - } - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java deleted file mode 100644 index 981c471aea..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java +++ /dev/null @@ -1,184 +0,0 @@ - -package eu.dnetlib.dhp.provision.update; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.*; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; -import eu.dnetlib.dhp.provision.scholix.ScholixRelationship; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.utils.DHPUtils; -import scala.Tuple2; - -public class SparkResolveScholixTarget { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkResolveScholixTarget.class - .getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); - parser.parseArgument(args); - - final SparkConf conf = new SparkConf(); - - final String master = parser.get("master"); - final String sourcePath = parser.get("sourcePath"); - final String workingDirPath = parser.get("workingDirPath"); - final String indexHost = parser.get("indexHost"); - try (SparkSession spark = getSession(conf, master)) { - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - spark - .createDataset( - sc - .sequenceFile(sourcePath, IntWritable.class, Text.class) - .map(Tuple2::_2) - .map(s -> new ObjectMapper().readValue(s.toString(), Scholix.class)) - .rdd(), - Encoders.bean(Scholix.class)) - .write() - .save(workingDirPath + "/stepA"); - - Dataset s1 = spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class)); - - s1 - .where(s1.col("target.dnetIdentifier").isNull()) - .select(s1.col("target.identifier")) - .distinct() - .map( - (MapFunction) f -> { - final String pid = ((Row) f.getList(0).get(0)).getString(0); - ScholixResource publication = new CrossrefClient(indexHost).getResourceByDOI(pid); - if (publication != null) { - return publication; - } - ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid); - if (dataset != null) { - return dataset; - } - ScholixResource r = new ScholixResource(); - r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi"))); - r.setObjectType("unknown"); - r - .setDnetIdentifier( - "70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim()))); - - return r; - }, - Encoders.bean(ScholixResource.class)) - .write() - .mode(SaveMode.Overwrite) - .save(workingDirPath + "/stepB"); - - Dataset s2 = spark - .read() - .load(workingDirPath + "/stepB") - .as(Encoders.bean(ScholixResource.class)); - - s1 - .joinWith( - s2, - s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")), - "left") - .flatMap( - (FlatMapFunction, Scholix>) f -> { - final List res = new ArrayList<>(); - final Scholix s = f._1(); - final ScholixResource target = f._2(); - if (StringUtils.isNotBlank(s.getIdentifier())) - res.add(s); - else if (target == null) { - ScholixResource currentTarget = s.getTarget(); - currentTarget.setObjectType("unknown"); - currentTarget - .setDnetIdentifier( - Datacite2Scholix - .generateId( - currentTarget.getIdentifier().get(0).getIdentifier(), - currentTarget.getIdentifier().get(0).getSchema(), - currentTarget.getObjectType())); - - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse - .setRelationship( - new ScholixRelationship( - s.getRelationship().getInverse(), - s.getRelationship().getSchema(), - s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); - - } else { - target - .setIdentifier( - target - .getIdentifier() - .stream() - .map( - d -> new ScholixIdentifier( - d.getIdentifier().toLowerCase(), - d.getSchema().toLowerCase())) - .collect(Collectors.toList())); - s.setTarget(target); - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse - .setRelationship( - new ScholixRelationship( - s.getRelationship().getInverse(), - s.getRelationship().getSchema(), - s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); - } - - return res.iterator(); - }, - Encoders.bean(Scholix.class)) - .javaRDD() - .map(s -> new ObjectMapper().writeValueAsString(s)) - .saveAsTextFile(workingDirPath + "/resolved_json"); - } - } - - private static SparkSession getSession(SparkConf conf, String master) { - return SparkSession - .builder() - .config(conf) - .appName(SparkResolveScholixTarget.class.getSimpleName()) - .master(master) - .getOrCreate(); - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_export_content_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_export_content_parameters.json deleted file mode 100644 index b92f87e084..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_export_content_parameters.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingDirPath", - "paramDescription": "the working path where generated files", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json deleted file mode 100644 index 6d90ced2cb..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "n", - "paramLongName": "nameNode", - "paramDescription": "the Name Node", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the source path", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "the target path", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json deleted file mode 100644 index 66676005e9..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json +++ /dev/null @@ -1,45 +0,0 @@ - -[ - { - "paramName":"nd", - "paramLongName":"newDeposition", - "paramDescription": "if it is a new deposition (true) or a new version (false)", - "paramRequired": true - }, - { - "paramName":"cri", - "paramLongName":"conceptRecordId", - "paramDescription": "The id of the concept record for a new version", - "paramRequired": false - }, - { - "paramName":"hdfsp", - "paramLongName":"hdfsPath", - "paramDescription": "the path of the folder tofind files to send to Zenodo", - "paramRequired": true - }, - { - "paramName": "nn", - "paramLongName": "nameNode", - "paramDescription": "the name node", - "paramRequired": true - }, - { - "paramName": "at", - "paramLongName": "accessToken", - "paramDescription": "the access token for the deposition", - "paramRequired": false - }, - { - "paramName":"cu", - "paramLongName":"connectionUrl", - "paramDescription": "the url to connect to deposit", - "paramRequired": false - }, - { - "paramName":"m", - "paramLongName":"metadata", - "paramDescription": "metadata associated to the deposition", - "paramRequired": false - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/cluster.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/cluster.json deleted file mode 100644 index 1cea6a8b92..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/cluster.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "cluster1": "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54", - "cluster2": "10.19.65.55, 10.19.65.56, 10.19.65.57, 10.19.65.58" -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/dataset2Json.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/dataset2Json.json deleted file mode 100644 index 41db00cbf3..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/dataset2Json.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "paramName": "m", - "paramLongName": "master", - "paramDescription": "master should be local or yarn", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingPath", - "paramDescription": "the working path", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/dropAndCreateIndex.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/dropAndCreateIndex.json deleted file mode 100644 index 242aca8c5e..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/dropAndCreateIndex.json +++ /dev/null @@ -1,14 +0,0 @@ -[ -{ - "paramName": "c", - "paramLongName": "cluster", - "paramDescription": "should be cluster1 or cluster2", - "paramRequired": true -}, - { - "paramName": "i", - "paramLongName": "index", - "paramDescription": "index name", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/index_on_es.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/index_on_es.json deleted file mode 100644 index 51b001a0de..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/index_on_es.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the working path where generated files", - "paramRequired": true - }, - { - "paramName": "i", - "paramLongName": "index", - "paramDescription": "the index name", - "paramRequired": true - }, - { - "paramName": "c", - "paramLongName": "cluster", - "paramDescription": "the index cluster", - "paramRequired": true - }, - - { - "paramName": "id", - "paramLongName": "idPath", - "paramDescription": "the identifier field name", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json deleted file mode 100644 index 37fbffb9b6..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingDirPath", - "paramDescription": "the working path where generated files", - "paramRequired": true - }, - { - "paramName": "g", - "paramLongName": "graphPath", - "paramDescription": "the relationPath path ", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_related_entities_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_related_entities_parameters.json deleted file mode 100644 index 4106ab352f..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_related_entities_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingDirPath", - "paramDescription": "the working path where generated files", - "paramRequired": true - }, - { - "paramName": "r", - "paramLongName": "relationPath", - "paramDescription": "the relationPath path ", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json deleted file mode 100644 index e4b6b9dfda..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "paramName": "m", - "paramLongName": "master", - "paramDescription": "the name node", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the source path", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingDirPath", - "paramDescription": "the working Dir Path", - "paramRequired": true - }, - { - "paramName": "h", - "paramLongName": "indexHost", - "paramDescription": "the working Dir Path", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json deleted file mode 100644 index 5c11aca8d1..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "paramName": "n", - "paramLongName": "namenode", - "paramDescription": "the name node", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "the working path where generated files", - "paramRequired": true - }, - { - "paramName": "ts", - "paramLongName": "timestamp", - "paramDescription": "the timestamp for incremental harvesting", - "paramRequired": true - }, - { - "paramName": "ih", - "paramLongName": "indexHost", - "paramDescription": "the ip name of the index", - "paramRequired": true - }, - { - "paramName": "in", - "paramLongName": "indexName", - "paramDescription": "the name of the index", - "paramRequired": true - } - -] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/scholix_index.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/scholix_index.json deleted file mode 100644 index 02718c1d37..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/scholix_index.json +++ /dev/null @@ -1,331 +0,0 @@ -{ - "mappings": { - "properties": { - "identifier": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "linkprovider": { - "type": "nested", - "properties": { - "identifiers": { - "properties": { - "identifier": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "schema": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "name": { - "type": "keyword" - } - } - }, - "publicationDate": { - "type": "keyword" - }, - "relationship": { - "properties": { - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "schema": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "source": { - "type": "nested", - "properties": { - "collectedFrom": { - "properties": { - "completionStatus": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "provider": { - "properties": { - "identifiers": { - "properties": { - "identifier": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "schema": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "provisionMode": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "creator": { - "properties": { - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "dnetIdentifier": { - "type": "keyword" - }, - "identifier": { - "type": "nested", - "properties": { - "identifier": { - "type": "keyword" - }, - "schema": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "type": { - "type": "keyword" - } - } - }, - "objectType": { - "type": "keyword" - }, - "publicationDate": { - "type": "keyword" - }, - "publisher": { - "type": "nested", - "properties": { - "name": { - "type": "keyword" - } - } - }, - "title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "target": { - "type": "nested", - "properties": { - "collectedFrom": { - "properties": { - "completionStatus": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "provider": { - "properties": { - "identifiers": { - "properties": { - "identifier": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "schema": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "provisionMode": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "creator": { - "properties": { - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "dnetIdentifier": { - "type": "keyword" - }, - "identifier": { - "type": "nested", - "properties": { - "identifier": { - "type": "keyword" - }, - "schema": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "type": { - "type": "keyword" - } - } - }, - "objectType": { - "type": "keyword" - }, - "publicationDate": { - "type": "keyword" - }, - "publisher": { - "type": "nested", - "properties": { - "name": { - "type": "keyword" - } - } - }, - "title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - } - } - }, - "settings": { - "index": { - "refresh_interval": "600s", - "number_of_shards": "48", - "translog": { - "sync_interval": "15s", - "durability": "ASYNC" - }, - "analysis": { - "analyzer": { - "analyzer_keyword": { - "filter": "lowercase", - "tokenizer": "keyword" - } - } - }, - "number_of_replicas": "0" - } - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/summary_index.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/summary_index.json deleted file mode 100644 index 1050985437..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/summary_index.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "mappings": { - "properties": { - "abstract": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "author": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "datasources": { - "type": "nested", - "properties": { - "completionStatus": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "datasourceId": { - "type": "keyword" - }, - "datasourceName": { - "type": "keyword" - } - } - }, - "date": { - "type": "keyword" - }, - "id": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "localIdentifier": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "type": { - "type": "keyword" - } - } - }, - "publisher": { - "type": "keyword" - }, - "relatedDatasets": { - "type": "long" - }, - "relatedPublications": { - "type": "long" - }, - "relatedUnknown": { - "type": "long" - }, - "subject": { - "properties": { - "scheme": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "value": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "typology": { - "type": "keyword" - } - } - }, - "settings": { - "index": { - "refresh_interval": "600s", - "number_of_shards": "48", - "translog": { - "sync_interval": "15s", - "durability": "ASYNC" - }, - "analysis": { - "analyzer": { - "analyzer_keyword": { - "filter": "lowercase", - "tokenizer": "keyword" - } - } - }, - "number_of_replicas": "0" - } - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/export/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/export/oozie_app/config-default.xml deleted file mode 100644 index 59e5c059fc..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/export/oozie_app/config-default.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.wf.rerun.failnodes - false - - - hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - spark2YarnHistoryServerAddress - http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 - - - spark2EventLogDir - /user/spark/spark2ApplicationHistory - - - spark2ExtraListeners - "com.cloudera.spark.lineage.NavigatorAppListener" - - - spark2SqlQueryExecutionListeners - "com.cloudera.spark.lineage.NavigatorQueryListener" - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/export/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/export/oozie_app/workflow.xml deleted file mode 100644 index 181ab80bf6..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/export/oozie_app/workflow.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - - workingDirPath - the source path - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - memory for individual executor - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - yarn-cluster - cluster - ExtractOAF - eu.dnetlib.dhp.export.SparkExportContentForOpenAire - dhp-graph-provision-scholexplorer-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 - ${sparkExtraOPT} - - --workingDirPath${workingDirPath} - --masteryarn-cluster - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml deleted file mode 100644 index 7c1a43e513..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml deleted file mode 100644 index d98164afb2..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml +++ /dev/null @@ -1,86 +0,0 @@ - - - - workingDirPath - the source path - - - index - the index name - - - esCluster - the Index cluster - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.provision.DropAndCreateESIndex - -i${index} - -c${esCluster} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - index summary - eu.dnetlib.dhp.provision.SparkIndexCollectionOnES - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" - -mt yarn-cluster - --sourcePath${workingDirPath}/summary_json - --index${index}_object - --idPathid - --cluster${esCluster} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - index scholix - eu.dnetlib.dhp.provision.SparkIndexCollectionOnES - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" - -mt yarn-cluster - --sourcePath${workingDirPath}/scholix_json - --index${index}_scholix - --idPathidentifier - --cluster${esCluster} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/config-default.xml deleted file mode 100644 index 7c1a43e513..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/config-default.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml deleted file mode 100644 index 4c0d6c1da7..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml +++ /dev/null @@ -1,116 +0,0 @@ - - - - workingDirPath - the source path - - - graphPath - the graph path - - - index - the index name - - - esCluster - the Index cluster - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - calculate for each ID the number of related Dataset, publication and Unknown - eu.dnetlib.dhp.provision.SparkExtractRelationCount - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} - -mt yarn-cluster - --workingDirPath${workingDirPath} - --relationPath${graphPath}/relation - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - generate Summary - eu.dnetlib.dhp.provision.SparkGenerateSummaryIndex - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT} - -mt yarn-cluster - --workingDirPath${workingDirPath} - --graphPath${graphPath} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - generate Scholix - eu.dnetlib.dhp.provision.SparkGenerateScholixIndex - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT} - -mt yarn-cluster - --workingDirPath${workingDirPath} - --graphPath${graphPath} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - generate Scholix - eu.dnetlib.dhp.provision.SparkConvertDatasetToJson - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT} - -m yarn-cluster - --workingPath${workingDirPath} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/config-default.xml deleted file mode 100644 index 7c1a43e513..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/config-default.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/workflow.xml deleted file mode 100644 index c004eafe69..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/workflow.xml +++ /dev/null @@ -1,97 +0,0 @@ - - - - workingDirPath - the source path - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - index - index name - - - timestamp - timestamp from incremental harvesting - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.provision.update.RetrieveUpdateFromDatacite - -t${workingDirPath}/synch/input_json - -n${nameNode} - -ts${timestamp} - -ihip-90-147-167-25.ct1.garrservices.it - -indatacite - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - resolve and generate Scholix - eu.dnetlib.dhp.provision.update.SparkResolveScholixTarget - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" - -m yarn-cluster - -s${workingDirPath}/synch/input_json - -w${workingDirPath}/synch - -hip-90-147-167-25.ct1.garrservices.it - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - index scholix - eu.dnetlib.dhp.provision.SparkIndexCollectionOnES - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" - -mt yarn-cluster - --sourcePath${workingDirPath}/synch/resolved_json - --index${index}_scholix - --idPathidentifier - --typescholix - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml deleted file mode 100644 index 3b9aaca2a6..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml +++ /dev/null @@ -1,48 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.wf.rerun.failnodes - false - - - hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - spark2YarnHistoryServerAddress - http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 - - - spark2EventLogDir - /user/spark/spark2ApplicationHistory - - - spark2ExtraListeners - "com.cloudera.spark.lineage.NavigatorAppListener" - - - spark2SqlQueryExecutionListeners - "com.cloudera.spark.lineage.NavigatorQueryListener" - - - - oozie.launcher.mapreduce.user.classpath.first - true - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml deleted file mode 100644 index fd8c773c98..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml +++ /dev/null @@ -1,53 +0,0 @@ - - - - sourcePath - the source path - - - targetPath - the target path - - - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.export.zenodo.MakeTar - -t${targetPath} - -n${nameNode} - -s${sourcePath} - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala deleted file mode 100644 index c62d169bc8..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala +++ /dev/null @@ -1,102 +0,0 @@ -package eu.dnetlib.dhp.export - -import java.time.LocalDateTime -import java.time.format.DateTimeFormatter - -import eu.dnetlib.dhp.provision.scholix.Scholix -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary -import eu.dnetlib.dhp.schema.oaf.Relation -import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} -import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} -import org.junit.jupiter.api.Test - -import scala.io.Source -import scala.collection.JavaConverters._ -class ExportDLITOOAFTest { - - val mapper = new ObjectMapper() - - @Test - def testDate():Unit = { - println(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))) - - } - - - def extractDatasources(s:Scholix):List[String]= { - s.getTarget.getCollectedFrom.asScala.map(c => c.getProvider.getName)(collection.breakOut) - } - - - def extractDatasources(s:ScholixSummary):List[String] = { - - s.getDatasources.asScala.map(c => c.getDatasourceName)(collection.breakOut) - - - } - - - @Test - def testMappingRele():Unit = { - - val r:Relation = new Relation - r.setSource("60|fbff1d424e045eecf24151a5fe3aa738") - r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877") - r.setRelType("IsReferencedBy") - - - val r1 =DLIToOAF.convertDLIRelation(r) - println(r1.getSource, r1.getTarget) - - } - - @Test - def testPublicationMapping():Unit = { - - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) - val json = Source.fromInputStream(getClass.getResourceAsStream("publication.json")).mkString - - - val oaf =DLIToOAF.convertDLIPublicationToOAF(mapper.readValue(json, classOf[DLIPublication])) - - println(mapper.writeValueAsString(oaf)) - - - } - - - @Test - def testExternalReferenceMapping():Unit = { - - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) - val json = Source.fromInputStream(getClass.getResourceAsStream("dataset.json")).mkString - - - val oaf =DLIToOAF.convertDLIDatasetToExternalReference(mapper.readValue(json, classOf[DLIDataset])) - - println(oaf) - - - } - - - - - - - - @Test - def testRelationMapping():Unit = { - - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) - val json = Source.fromInputStream(getClass.getResourceAsStream("relation.json")).mkString - - - val oaf =mapper.readValue(json, classOf[Relation]) - - println(mapper.writeValueAsString(oaf)) - - - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java deleted file mode 100644 index d9cbd22f3f..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java +++ /dev/null @@ -1,50 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.provision.update.*; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public class DataciteClientTest { - @Test - public void dataciteSCholixTest() throws Exception { - final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json")); - final RelationMapper mapper = RelationMapper.load(); - - Datacite2Scholix ds = new Datacite2Scholix(mapper); - final List s = ds.generateScholixFromJson(json); - System.out.println(new ObjectMapper().writeValueAsString(s)); - } - - // public void testS() throws Exception { - // RetrieveUpdateFromDatacite.main(new String[]{ - // "-n", "file:///data/new_s2.txt", - // "-t", "/data/new_s2.txt", - // "-ts", "1586974078", - // "-ih", "ip-90-147-167-25.ct1.garrservices.it", - // "-in", "datacite", - // }); - // - // } - - public void testResolveDataset() throws Exception { - DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5"); - Assertions.assertNotNull(datasetByDOI); - System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI)); - - CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); - Assertions.assertNotNull(crossrefByDOI); - System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI)); - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DropAndCreateESIndexTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DropAndCreateESIndexTest.java deleted file mode 100644 index 19e8aa6990..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DropAndCreateESIndexTest.java +++ /dev/null @@ -1,13 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import org.junit.jupiter.api.Test; - -public class DropAndCreateESIndexTest { - - public void testDropAndCreate() throws Exception { - DropAndCreateESIndex.main("-c localhost -i dli_shadow".split(" ")); - - } - -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java deleted file mode 100644 index be97072b57..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java +++ /dev/null @@ -1,30 +0,0 @@ - -package eu.dnetlib.dhp.provision; - -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; - -public class ExtractInfoTest { - @Test - public void testSerialization() throws Exception { - - ScholixSummary summary = new ScholixSummary(); - summary.setDescription("descrizione"); - ObjectMapper mapper = new ObjectMapper(); - String json = mapper.writeValueAsString(summary); - System.out.println(json); - System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription()); - } - - @Test - public void testScholix() throws Exception { - final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); - final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json")); - Scholix.generateScholixWithSource(jsonSummary, jsonRelation); - } -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/dataset.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/dataset.json deleted file mode 100644 index dae6357305..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/dataset.json +++ /dev/null @@ -1,101 +0,0 @@ -{ - "dataInfo": { - "invisible": false, - "inferred": null, - "deletedbyinference": false, - "trust": "0.9", - "inferenceprovenance": null, - "provenanceaction": null - }, - "lastupdatetimestamp": null, - "id": "60|719f19e5a996de1b87cddf93871bf2d4", - "originalId": [ - "a0a3p2gws9::uniprot" - ], - "collectedfrom": [ - { - "key": "dli_________::europe_pmc__", - "value": "Europe PMC", - "dataInfo": null - } - ], - "pid": [ - { - "value": "acc63471", - "qualifier": { - "classid": "ena", - "classname": "ena", - "schemeid": "dnet:pid_types", - "schemename": "dnet:pid_types" - }, - "dataInfo": null - } - ], - "dateofcollection": "2019-07-05T12:47:11.545+02:00", - "dateoftransformation": null, - "extraInfo": null, - "oaiprovenance": null, - "author": null, - "resulttype": { - "classid": "dataset", - "classname": "dataset", - "schemeid": "dataset", - "schemename": "dataset" - }, - "language": null, - "country": null, - "subject": [], - "title": [ - { - "value": "CMD domain-containing protein", - "qualifier": null, - "dataInfo": null - } - ], - "relevantdate": [ - { - "value": "2019-07-15T16:14:28.636", - "qualifier": { - "classid": "resolvedDate", - "classname": "resolvedDate", - "schemeid": "dnet::date", - "schemename": "dnet::date" - }, - "dataInfo": null - } - ], - "description": null, - "dateofacceptance": null, - "publisher": { - "value": "UniProt", - "dataInfo": null - }, - "embargoenddate": null, - "source": null, - "fulltext": null, - "format": null, - "contributor": null, - "resourcetype": null, - "coverage": null, - "bestaccessright": null, - "context": null, - "externalReference": null, - "instance": [], - "storagedate": null, - "device": null, - "size": null, - "version": null, - "lastmetadataupdate": null, - "metadataversionnumber": null, - "geolocation": null, - "originalObjIdentifier": "europe_pmc__::719f19e5a996de1b87cddf93871bf2d4", - "dlicollectedfrom": [ - { - "id": "dli_________::europe_pmc__", - "name": "Europe PMC", - "completionStatus": "complete", - "collectionMode": null - } - ], - "completionStatus": "complete" -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/publication.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/publication.json deleted file mode 100644 index 4ab3de2da0..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/publication.json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "dataInfo": { - "invisible": false, - "inferred": null, - "deletedbyinference": false, - "trust": "0.9", - "inferenceprovenance": null, - "provenanceaction": null - }, - "lastupdatetimestamp": null, - "id": "50|9e117414be07bf03cbce8889d22d661a", - "originalId": [ - "9e117414be07bf03cbce8889d22d661a" - ], - "collectedfrom": [ - { - "key": "dli_________::crossref", - "value": "Crossref", - "dataInfo": null - } - ], - "pid": [ - { - "value": "10.1007/978-94-017-3490-5_15", - "qualifier": { - "classid": "doi", - "classname": "doi", - "schemeid": "dnet:pid_types", - "schemename": "dnet:pid_types" - }, - "dataInfo": null - } - ], - "dateofcollection": "2020-06-08T07:28:55.731Z", - "dateoftransformation": null, - "extraInfo": null, - "oaiprovenance": null, - "author": [ - { - "fullname": "Calcaterra Domenico", - "name": null, - "surname": null, - "rank": null, - "pid": null, - "affiliation": null - }, - { - "fullname": "Parise Mario", - "name": null, - "surname": null, - "rank": null, - "pid": null, - "affiliation": null - } - ], - "resulttype": { - "classid": "publication", - "classname": "publication", - "schemeid": "publication", - "schemename": "publication" - }, - "language": null, - "country": null, - "subject":[ - { - "value":"Strain-linked information about bacterial and archaeal biodiversity", - "qualifier":{ - "classid":"dnet:subject", - "classname":"dnet:subject", - "schemeid":"", - "schemename":"" - }, - "dataInfo":null - } - ], - "title": [ - { - "value": "The Contribution of Historical Information in the Assessment of Landslide Hazard", - "qualifier": null, - "dataInfo": null - } - ], - "relevantdate": [ - { - "value": "2013-01-29T16:50:44Z", - "qualifier": { - "classid": "date", - "classname": "date", - "schemeid": "dnet::date", - "schemename": "dnet::date" - }, - "dataInfo": null - } - ], - "description": [ - { - "value": null, - "dataInfo": null - } - ], - "dateofacceptance": null, - "publisher": { - "value": "Springer Netherlands", - "dataInfo": null - }, - "embargoenddate": null, - "source": null, - "fulltext": null, - "format": null, - "contributor": null, - "resourcetype": null, - "coverage": null, - "bestaccessright": null, - "context": null, - "externalReference": null, - "instance": [], - "journal": null, - "originalObjIdentifier": "dli_resolver::9e117414be07bf03cbce8889d22d661a", - "dlicollectedfrom": [ - { - "id": "dli_________::crossref", - "name": "Crossref", - "completionStatus": "complete", - "collectionMode": "resolved" - } - ], - "completionStatus": "complete" -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/relation.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/relation.json deleted file mode 100644 index 7aa25525ee..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/export/relation.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "subRelType": null, - "relClass": "datacite", - "dataInfo": { - "deletedbyinference": false, - "provenanceaction": null, - "inferred": null, - "inferenceprovenance": null, - "invisible": false, - "trust": "0.9" - }, - "target": "50|00062410e2a15322480277d063c181bb", - "lastupdatetimestamp": null, - "relType": "IsReferencedBy", - "source": "60|4ee78ab329b49416b45c3774c132f244", - "collectedfrom": [ - { - "dataInfo": null, - "value": "Europe PMC", - "key": "dli_________::europe_pmc__" - } - ] -} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/datacite.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/datacite.json deleted file mode 100644 index f23fa314e9..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/datacite.json +++ /dev/null @@ -1,136 +0,0 @@ -{ - "relationships": { - "client": { - "data": { - "type": "clients", - "id": "crossref.citations" - } - } - }, - "attributes": { - "contributors": [ - ], - "titles": [ - { - "title": "UV-visible spectroscopy in the interpretation of the tautomeric equilibrium of N,N′(bis-3,5-di-bromo-salicyliden)-1,2-diaminobenzene and the redox activity of its Co(II) complex. A quantum chemical approach." - } - ], - "descriptions": [ - ], - "referenceCount": 0, - "subjects": [ - ], - "container": { - "title": "Journal of Molecular Structure: THEOCHEM", - "firstPage": "97", - "volume": "367", - "lastPage": "110", - "identifierType": "ISSN", - "identifier": "0166-1280", - "type": "Journal" - }, - "state": "findable", - "created": "2020-03-26T13:31:57.000Z", - "source": "levriero", - "metadataVersion": 0, - "version": null, - "isActive": true, - "contentUrl": null, - "geoLocations": [ - ], - "updated": "2020-03-26T13:31:58.000Z", - "fundingReferences": [ - ], - "viewCount": 0, - "registered": "2020-03-26T13:31:58.000Z", - "published": "1996", - "dates": [ - { - "date": "1996-09", - "dateType": "Issued" - }, - { - "date": "2019-04-17T13:58:25Z", - "dateType": "Updated" - } - ], - "relatedIdentifiers": [ - { - "relationType": "IsPartOf", - "relatedIdentifier": "0166-1280", - "relatedIdentifierType": "ISSN", - "resourceTypeGeneral": "Collection" - } - ], - "reason": null, - "rightsList": [ - { - "rightsUri": "https://www.elsevier.com/tdm/userlicense/1.0" - } - ], - "schemaVersion": "http://datacite.org/schema/kernel-4", - "types": { - "resourceType": "JournalArticle", - "ris": "JOUR", - "resourceTypeGeneral": "Text", - "bibtex": "article", - "citeproc": "article-journal", - "schemaOrg": "ScholarlyArticle" - }, - "publisher": "Elsevier BV", - "publicationYear": 1996, - "doi": "10.1016/s0166-1280(96)04575-7", - "language": null, - "sizes": [ - ], - "url": "https://linkinghub.elsevier.com/retrieve/pii/S0166128096045757", - "identifiers": [ - { - "identifier": "https://doi.org/10.1016/s0166-1280(96)04575-7", - "identifierType": "DOI" - }, - { - "identifier": "S0166128096045757", - "identifierType": "Publisher ID" - } - ], - "citationCount": 0, - "formats": [ - ], - "downloadCount": 0, - "creators": [ - { - "nameType": "Personal", - "givenName": "G.L.", - "name": "Estiú, G.L.", - "familyName": "Estiú", - "affiliation": [ - ] - }, - { - "nameType": "Personal", - "givenName": "A.H.", - "name": "Jubert, A.H.", - "familyName": "Jubert", - "affiliation": [ - ] - }, - { - "nameType": "Personal", - "givenName": "J.", - "name": "Costamagna, J.", - "familyName": "Costamagna", - "affiliation": [ - ] - }, - { - "nameType": "Personal", - "givenName": "J.", - "name": "Vargas, J.", - "familyName": "Vargas", - "affiliation": [ - ] - } - ] - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/es.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/es.json deleted file mode 100644 index 7520f5711b..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/es.json +++ /dev/null @@ -1,2191 +0,0 @@ -{ - "_scroll_id":"DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAA3ZFjJyVjB4UWROUlV1NEZvcGFSNUFFNEEAAAAAAAAMKxZwaDlETnJ4alJDeUZYdGlLck9XQ3dBAAAAAAAADCkWRHhuQi1obllTOFdVYWtWeUN2SzdaUQAAAAAAAq-XFjNsWU1LeTlrVGhPZERWRW9iWEZ5QlEAAAAAAAAOQBY5R0Y2TXNyRFM3bUJkMHE4SkVQZmNR", - "took":16, - "timed_out":false, - "_shards":{ - "total":5, - "successful":5, - "skipped":0, - "failed":0 - }, - "hits":{ - "total":3281, - "max_score":1.0, - "hits":[ - { - "_index":"datacite", - "_type":"dump", - "_id":"10.17605/osf.io/vkdyt", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"cos.osf" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"COVID-19-RAA" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - - }, - "state":"findable", - "created":"2020-03-24T20:15:45.000Z", - "source":"mds", - "metadataVersion":20, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:36:41.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-24T20:15:47.000Z", - "published":"2020", - "dates":[ - { - "date":"2020", - "dateType":"Issued" - } - ], - "relatedIdentifiers":[ - - ], - "reason":null, - "rightsList":[ - - ], - "schemaVersion":null, - "types":{ - "resourceType":"Project", - "ris":"RPRT", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Open Science Framework", - "publicationYear":2020, - "doi":"10.17605/osf.io/vkdyt", - "language":null, - "sizes":[ - - ], - "url":"https://osf.io/vkdyt/", - "identifiers":[ - { - "identifier":"https://doi.org/10.17605/osf.io/vkdyt", - "identifierType":"DOI" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Gjalt - Jorn", - "name":"Peters, Gjalt - Jorn", - "familyName":"Peters", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Sylvia", - "name":"Roozen, Sylvia", - "familyName":"Roozen", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Rik", - "name":"Crutzen, Rik", - "familyName":"Crutzen", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Gill", - "name":"Hoor, Gill", - "familyName":"Hoor", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Sander", - "name":"Hermsen, Sander", - "familyName":"Hermsen", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Lisa-Graham", - "name":"Wisener, Lisa-Graham", - "familyName":"Wisener", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Laura", - "name":"König, Laura", - "familyName":"König", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Alexis", - "name":"Ruffault, Alexis", - "familyName":"Ruffault", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Jennifer", - "name":"Inauen, Jennifer", - "familyName":"Inauen", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Marta", - "name":"Marques, Marta", - "familyName":"Marques", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Chris", - "name":"Noone, Chris", - "familyName":"Noone", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Lucy", - "name":"Porter, Lucy", - "familyName":"Porter", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Samantha", - "name":"van Beurden, Samantha", - "familyName":"van Beurden", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Ann", - "name":"DeSmet, Ann", - "familyName":"DeSmet", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Ratri", - "name":"Nurwanti, Ratri", - "familyName":"Nurwanti", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Yasinta", - "name":"Sokang, Yasinta", - "familyName":"Sokang", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Alexandra", - "name":"Dima, Alexandra", - "familyName":"Dima", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Gabriele", - "name":"Pierantoni, Gabriele", - "familyName":"Pierantoni", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Angelos", - "name":"Kassianos, Angelos", - "familyName":"Kassianos", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"James", - "name":"Green, James", - "familyName":"Green", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Daniel", - "name":"Powell, Daniel", - "familyName":"Powell", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Thomas", - "name":"Gültzow, Thomas", - "familyName":"Gültzow", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Stan", - "name":"Vluggen, Stan", - "familyName":"Vluggen", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Roel", - "name":"Hermans, Roel", - "familyName":"Hermans", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Laura", - "name":"Eito, Laura", - "familyName":"Eito", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Lisa", - "name":"Warner, Lisa", - "familyName":"Warner", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Lena", - "name":"Fleig, Lena", - "familyName":"Fleig", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Emma", - "name":"Berry, Emma", - "familyName":"Berry", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Daniela", - "name":"Lange, Daniela", - "familyName":"Lange", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Pierre", - "name":"Gérain, Pierre", - "familyName":"Gérain", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Annick", - "name":"De Paepe, Annick", - "familyName":"De Paepe", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Louise", - "name":"Poppe, Louise", - "familyName":"Poppe", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Eva", - "name":"Papakonstantinou, Eva", - "familyName":"Papakonstantinou", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Hanna", - "name":"de Paredes, Hanna", - "familyName":"de Paredes", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Jorge", - "name":"Encantado, Jorge", - "familyName":"Encantado", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Carolina", - "name":"Silva, Carolina", - "familyName":"Silva", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Tracy", - "name":"Epton, Tracy", - "familyName":"Epton", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Jenny", - "name":"Groarke, Jenny", - "familyName":"Groarke", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Sarah", - "name":"Denford, Sarah", - "familyName":"Denford", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Barbara", - "name":"Mullan, Barbara", - "familyName":"Mullan", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Rebecca", - "name":"Pedruzzi, Rebecca", - "familyName":"Pedruzzi", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.17605/osf.io/vkdyt", - "timestamp":1585226201 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1080/13510340500378274", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Meteoric trajectory: The Res Publica Party in Estonia" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Democratization", - "type":"Journal", - "firstPage":"78", - "volume":"13", - "lastPage":"94", - "identifierType":"ISSN", - "identifier":"1743-890X", - "issue":"1" - }, - "state":"findable", - "created":"2020-03-26T13:31:47.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:31:54.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:31:47.000Z", - "published":"2006", - "dates":[ - { - "date":"2006-02", - "dateType":"Issued" - }, - { - "date":"2016-12-13T05:03:17Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"1743-890X", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - } - ], - "reason":null, - "rightsList":[ - - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Informa UK Limited", - "publicationYear":2006, - "doi":"10.1080/13510340500378274", - "language":null, - "sizes":[ - - ], - "url":"http://www.tandfonline.com/doi/abs/10.1080/13510340500378274", - "identifiers":[ - { - "identifier":"https://doi.org/10.1080/13510340500378274", - "identifierType":"DOI" - }, - { - "identifier":"5", - "identifierType":"Publisher ID" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Rein", - "name":"Taagepera, Rein", - "familyName":"Taagepera", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1080/13510340500378274", - "timestamp":1585225914 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1029/2001gl012966", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Use of 17 O/ 16 O to trace atmospherically-deposited sulfate in surface waters: A case study in Alpine watersheds in the Rocky Mountains" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Geophysical Research Letters", - "type":"Journal", - "firstPage":"4483", - "volume":"28", - "lastPage":"4486", - "identifierType":"ISSN", - "identifier":"0094-8276", - "issue":"23" - }, - "state":"findable", - "created":"2020-03-26T13:31:52.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:31:53.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:31:53.000Z", - "published":"2001", - "dates":[ - { - "date":"2001-12-01", - "dateType":"Issued" - }, - { - "date":"2018-04-17T02:49:52Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"0094-8276", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/s0016-7037(00)00490-7", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1007/978-1-4612-2788-5", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/95wr02037", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/0016-7037(63)90071-1", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/0012-821x(83)90066-3", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/91wr01243", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1126/science.217.4554.51", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/91jd01943", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1126/science.177.4048.514", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/97jd02075", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/2000jd900805", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/2000gl011826", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/s1352-2310(00)00507-0", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/1999wr900276", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/b978-0-444-81546-0.50022-7", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/s1352-2310(99)00122-3", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/2000jd900456", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/94gl00893", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1126/science.283.5400.341", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1007/978-1-4612-3616-0_6", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1130/0016-7606(1963)74[991:paotcm]2.0.co;2", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.2113/gsecongeo.87.2.225", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1029/90wr02774", - "relatedIdentifierType":"DOI" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"http://doi.wiley.com/10.1002/tdm_license_1.1" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"American Geophysical Union (AGU)", - "publicationYear":2001, - "doi":"10.1029/2001gl012966", - "language":null, - "sizes":[ - - ], - "url":"http://doi.wiley.com/10.1029/2001GL012966", - "identifiers":[ - { - "identifier":"https://doi.org/10.1029/2001gl012966", - "identifierType":"DOI" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Craig A.", - "name":"Johnson, Craig A.", - "familyName":"Johnson", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"M. Alisa", - "name":"Mast, M. Alisa", - "familyName":"Mast", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Cynthia L.", - "name":"Kester, Cynthia L.", - "familyName":"Kester", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1029/2001gl012966", - "timestamp":1585225913 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.4094/chnr.2014.20.4.294", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Critical Thinking Disposition, Problem Solving Process, and Simulation-Based Assessment of Clinical Competence of Nursing Students in Pediatric Nursing" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Child Health Nursing Research", - "type":"Journal", - "firstPage":"294", - "volume":"20", - "identifierType":"ISSN", - "identifier":"2287-9129", - "issue":"4" - }, - "state":"findable", - "created":"2020-03-26T13:31:56.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:31:56.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:31:56.000Z", - "published":"2014", - "dates":[ - { - "date":"2014", - "dateType":"Issued" - }, - { - "date":"2019-08-17T03:06:14Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"2287-9129", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/j.nedt.2010.10.013", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.3928/0148-4834-20001101-09", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.3310/hta13270", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.5977/jkasne.2013.19.2.228", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/j.teln.2008.07.004", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.4040/jkan.2011.41.2.245", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.4040/jkan.2006.36.6.950", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.5977/jkasne.2011.17.2.226", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.4040/jkan.2011.41.4.433", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.7748/ns2010.05.24.35.42.c7751", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1053/j.nainr.2009.03.006", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.5977/jkasne.2009.15.2.149", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.5124/jkma.2005.48.3.267", - "relatedIdentifierType":"DOI" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"http://creativecommons.org/licenses/by-nc/3.0" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Korean Academy of Child Health Nursing", - "publicationYear":2014, - "doi":"10.4094/chnr.2014.20.4.294", - "language":null, - "sizes":[ - - ], - "url":"http://e-chnr.org/journal/view.php?id=10.4094/chnr.2014.20.4.294", - "identifiers":[ - { - "identifier":"https://doi.org/10.4094/chnr.2014.20.4.294", - "identifierType":"DOI" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Sunghee", - "name":"Kim, Sunghee", - "familyName":"Kim", - "affiliation":[ - "Red Cross College of Nursing, Chung-Ang University, Seoul, Korea." - ] - }, - { - "nameType":"Personal", - "givenName":"Hyuna", - "name":"Nam, Hyuna", - "familyName":"Nam", - "affiliation":[ - "Department of Nursing, Pyeongtaek University, Pyeongtaek, Korea." - ] - }, - { - "nameType":"Personal", - "givenName":"Miok", - "name":"Kim, Miok", - "familyName":"Kim", - "affiliation":[ - "Department of Nursing, Namseoul University, Cheonan, Korea." - ] - } - ] - }, - "type":"dois", - "id":"10.4094/chnr.2014.20.4.294", - "timestamp":1585225916 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1016/s0166-1280(96)04575-7", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"UV-visible spectroscopy in the interpretation of the tautomeric equilibrium of N,N′(bis-3,5-di-bromo-salicyliden)-1,2-diaminobenzene and the redox activity of its Co(II) complex. A quantum chemical approach." - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Journal of Molecular Structure: THEOCHEM", - "firstPage":"97", - "volume":"367", - "lastPage":"110", - "identifierType":"ISSN", - "identifier":"0166-1280", - "type":"Journal" - }, - "state":"findable", - "created":"2020-03-26T13:31:57.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:31:58.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:31:58.000Z", - "published":"1996", - "dates":[ - { - "date":"1996-09", - "dateType":"Issued" - }, - { - "date":"2019-04-17T13:58:25Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"0166-1280", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"https://www.elsevier.com/tdm/userlicense/1.0" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Elsevier BV", - "publicationYear":1996, - "doi":"10.1016/s0166-1280(96)04575-7", - "language":null, - "sizes":[ - - ], - "url":"https://linkinghub.elsevier.com/retrieve/pii/S0166128096045757", - "identifiers":[ - { - "identifier":"https://doi.org/10.1016/s0166-1280(96)04575-7", - "identifierType":"DOI" - }, - { - "identifier":"S0166128096045757", - "identifierType":"Publisher ID" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"G.L.", - "name":"Estiú, G.L.", - "familyName":"Estiú", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"A.H.", - "name":"Jubert, A.H.", - "familyName":"Jubert", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"J.", - "name":"Costamagna, J.", - "familyName":"Costamagna", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"J.", - "name":"Vargas, J.", - "familyName":"Vargas", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1016/s0166-1280(96)04575-7", - "timestamp":1585225918 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1016/j.jhin.2013.12.002", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Consequences of incomplete measles vaccine uptake in healthcare workers during an outbreak in North East England" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Journal of Hospital Infection", - "type":"Journal", - "firstPage":"144", - "volume":"86", - "lastPage":"146", - "identifierType":"ISSN", - "identifier":"0195-6701", - "issue":"2" - }, - "state":"findable", - "created":"2020-03-26T13:31:58.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:31:58.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:31:58.000Z", - "published":"2014", - "dates":[ - { - "date":"2014-02", - "dateType":"Issued" - }, - { - "date":"2018-10-07T23:20:19Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"0195-6701", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"https://www.elsevier.com/tdm/userlicense/1.0" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Elsevier BV", - "publicationYear":2014, - "doi":"10.1016/j.jhin.2013.12.002", - "language":null, - "sizes":[ - - ], - "url":"https://linkinghub.elsevier.com/retrieve/pii/S0195670113004052", - "identifiers":[ - { - "identifier":"https://doi.org/10.1016/j.jhin.2013.12.002", - "identifierType":"DOI" - }, - { - "identifier":"S0195670113004052", - "identifierType":"Publisher ID" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"P.", - "name":"Bogowicz, P.", - "familyName":"Bogowicz", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"J.", - "name":"Waller, J.", - "familyName":"Waller", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"D.", - "name":"Wilson, D.", - "familyName":"Wilson", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"K.", - "name":"Foster, K.", - "familyName":"Foster", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1016/j.jhin.2013.12.002", - "timestamp":1585225918 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1186/s12871-015-0149-y", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Normal values for pancreatic stone protein in different age groups" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"BMC Anesthesiology", - "type":"Journal", - "volume":"15", - "identifierType":"ISSN", - "identifier":"1471-2253", - "issue":"1" - }, - "state":"findable", - "created":"2020-03-26T13:32:00.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:32:00.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:32:00.000Z", - "published":"2015", - "dates":[ - { - "date":"2015-11-20", - "dateType":"Issued" - }, - { - "date":"2017-06-23T20:02:57Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"1471-2253", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1186/cc12588", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1038/tpj.2012.1", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1164/rccm.201201-0037oc", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1097/01.mop.0000193293.87022.4c", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/s1473-3099(04)01146-6", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1159/000241296", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1097/01.ccm.0000159089.16462.4a", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1378/chest.11-0018", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1097/ccm.0b013e31819da7d6", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1007/s00134-012-2798-3", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/j.jss.2005.09.030", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1186/1471-2431-10-89", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1210/jc.2014-2244", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/j.imbio.2012.06.001", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1111/j.1399-3038.2010.01104.x", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1186/cc11406", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1097/ccm.0b013e3182771193", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1016/j.cyto.2014.01.009", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1378/chest.12-0730", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1136/bmjopen-2014-004914", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1086/653531", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1210/endo.136.5.7720628", - "relatedIdentifierType":"DOI" - }, - { - "relationType":"References", - "relatedIdentifier":"10.1111/j.1651-2227.2002.tb01645.x", - "relatedIdentifierType":"DOI" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"http://www.springer.com/tdm" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Springer Science and Business Media LLC", - "publicationYear":2015, - "doi":"10.1186/s12871-015-0149-y", - "language":null, - "sizes":[ - - ], - "url":"http://bmcanesthesiol.biomedcentral.com/articles/10.1186/s12871-015-0149-y", - "identifiers":[ - { - "identifier":"https://doi.org/10.1186/s12871-015-0149-y", - "identifierType":"DOI" - }, - { - "identifier":"168", - "identifierType":"Publisher ID" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Luregn J", - "name":"Schlapbach, Luregn J", - "familyName":"Schlapbach", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Eric", - "name":"Giannoni, Eric", - "familyName":"Giannoni", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Sven", - "name":"Wellmann, Sven", - "familyName":"Wellmann", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Martin", - "name":"Stocker, Martin", - "familyName":"Stocker", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Roland A", - "name":"Ammann, Roland A", - "familyName":"Ammann", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Rolf", - "name":"Graf, Rolf", - "familyName":"Graf", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1186/s12871-015-0149-y", - "timestamp":1585225920 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1097/fch.0000000000000237", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Concordance and Discordance of the Knowledge, Understanding, and Description of Childrenʼs Experience of Food Insecurity Among Hispanic Adults and Children" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Family & Community Health", - "type":"Journal", - "firstPage":"237", - "volume":"42", - "lastPage":"244", - "identifierType":"ISSN", - "identifier":"0160-6379", - "issue":"4" - }, - "state":"findable", - "created":"2020-03-26T13:32:19.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:32:19.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:32:19.000Z", - "published":"2019", - "dates":[ - { - "date":"2019", - "dateType":"Issued" - }, - { - "date":"2020-02-18T14:54:24Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"0160-6379", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - } - ], - "reason":null, - "rightsList":[ - - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Ovid Technologies (Wolters Kluwer Health)", - "publicationYear":2019, - "doi":"10.1097/fch.0000000000000237", - "language":null, - "sizes":[ - - ], - "url":"http://journals.lww.com/00003727-201910000-00002", - "identifiers":[ - { - "identifier":"https://doi.org/10.1097/fch.0000000000000237", - "identifierType":"DOI" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Edward A.", - "name":"Frongillo, Edward A.", - "familyName":"Frongillo", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Maryah S.", - "name":"Fram, Maryah S.", - "familyName":"Fram", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Jessica L.", - "name":"Escobar-Alegría, Jessica L.", - "familyName":"Escobar-Alegría", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Melly", - "name":"Pérez-Garay, Melly", - "familyName":"Pérez-Garay", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Mark M.", - "name":"Macauda, Mark M.", - "familyName":"Macauda", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Deborah L.", - "name":"Billings, Deborah L.", - "familyName":"Billings", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1097/fch.0000000000000237", - "timestamp":1585225939 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1016/s0967-0637(97)00068-x", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"Characteristics of the South Atlantic subtropical frontal zone between 15°W and 5°E" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Deep Sea Research Part I: Oceanographic Research Papers", - "type":"Journal", - "firstPage":"167", - "volume":"45", - "lastPage":"192", - "identifierType":"ISSN", - "identifier":"0967-0637", - "issue":"1" - }, - "state":"findable", - "created":"2020-03-26T13:32:22.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:32:23.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:32:23.000Z", - "published":"1998", - "dates":[ - { - "date":"1998-01", - "dateType":"Issued" - }, - { - "date":"2019-04-22T21:55:07Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"0967-0637", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"https://www.elsevier.com/tdm/userlicense/1.0" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Elsevier BV", - "publicationYear":1998, - "doi":"10.1016/s0967-0637(97)00068-x", - "language":null, - "sizes":[ - - ], - "url":"https://linkinghub.elsevier.com/retrieve/pii/S096706379700068X", - "identifiers":[ - { - "identifier":"https://doi.org/10.1016/s0967-0637(97)00068-x", - "identifierType":"DOI" - }, - { - "identifier":"S096706379700068X", - "identifierType":"Publisher ID" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"D.", - "name":"Smythe-Wright, D.", - "familyName":"Smythe-Wright", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"P.", - "name":"Chapman, P.", - "familyName":"Chapman", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"C.Duncombe", - "name":"Rae, C.Duncombe", - "familyName":"Rae", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"L.V.", - "name":"Shannon, L.V.", - "familyName":"Shannon", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"S.M.", - "name":"Boswell, S.M.", - "familyName":"Boswell", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1016/s0967-0637(97)00068-x", - "timestamp":1585225943 - } - }, - { - "_index":"datacite", - "_type":"dump", - "_id":"10.1016/j.elecom.2011.05.032", - "_score":1.0, - "_source":{ - "relationships":{ - "client":{ - "data":{ - "type":"clients", - "id":"crossref.citations" - } - } - }, - "attributes":{ - "contributors":[ - - ], - "titles":[ - { - "title":"ZnO/NiO nanocomposite electrodes for low-temperature solid oxide fuel cells" - } - ], - "descriptions":[ - - ], - "referenceCount":0, - "subjects":[ - - ], - "container":{ - "title":"Electrochemistry Communications", - "type":"Journal", - "firstPage":"917", - "volume":"13", - "lastPage":"920", - "identifierType":"ISSN", - "identifier":"1388-2481", - "issue":"9" - }, - "state":"findable", - "created":"2020-03-26T13:32:40.000Z", - "source":"levriero", - "metadataVersion":0, - "version":null, - "isActive":true, - "contentUrl":null, - "geoLocations":[ - - ], - "updated":"2020-03-26T13:32:40.000Z", - "fundingReferences":[ - - ], - "viewCount":0, - "registered":"2020-03-26T13:32:40.000Z", - "published":"2011", - "dates":[ - { - "date":"2011-09", - "dateType":"Issued" - }, - { - "date":"2018-12-01T18:20:58Z", - "dateType":"Updated" - } - ], - "relatedIdentifiers":[ - { - "relationType":"IsPartOf", - "relatedIdentifier":"1388-2481", - "relatedIdentifierType":"ISSN", - "resourceTypeGeneral":"Collection" - } - ], - "reason":null, - "rightsList":[ - { - "rightsUri":"https://www.elsevier.com/tdm/userlicense/1.0" - } - ], - "schemaVersion":"http://datacite.org/schema/kernel-4", - "types":{ - "resourceType":"JournalArticle", - "ris":"JOUR", - "resourceTypeGeneral":"Text", - "bibtex":"article", - "citeproc":"article-journal", - "schemaOrg":"ScholarlyArticle" - }, - "publisher":"Elsevier BV", - "publicationYear":2011, - "doi":"10.1016/j.elecom.2011.05.032", - "language":null, - "sizes":[ - - ], - "url":"https://linkinghub.elsevier.com/retrieve/pii/S138824811100230X", - "identifiers":[ - { - "identifier":"https://doi.org/10.1016/j.elecom.2011.05.032", - "identifierType":"DOI" - }, - { - "identifier":"S138824811100230X", - "identifierType":"Publisher ID" - } - ], - "citationCount":0, - "formats":[ - - ], - "downloadCount":0, - "creators":[ - { - "nameType":"Personal", - "givenName":"Rizwan", - "name":"Raza, Rizwan", - "familyName":"Raza", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Qinghua", - "name":"Liu, Qinghua", - "familyName":"Liu", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Jawad", - "name":"Nisar, Jawad", - "familyName":"Nisar", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Xiaodi", - "name":"Wang, Xiaodi", - "familyName":"Wang", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Ying", - "name":"Ma, Ying", - "familyName":"Ma", - "affiliation":[ - - ] - }, - { - "nameType":"Personal", - "givenName":"Bin", - "name":"Zhu, Bin", - "familyName":"Zhu", - "affiliation":[ - - ] - } - ] - }, - "type":"dois", - "id":"10.1016/j.elecom.2011.05.032", - "timestamp":1585225960 - } - } - ] - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/record.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/record.json deleted file mode 100644 index a79e7334fa..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/record.json +++ /dev/null @@ -1 +0,0 @@ -{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"references","subRelType":null,"relClass":"datacite","source":"50|f2123fce7e56c73dc8f1bf64ec59b477","target":"50|b618cbe39ba940a29993ac324e5f9621","collectedFrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/relation.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/relation.json deleted file mode 100644 index 3cca6e370f..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/relation.json +++ /dev/null @@ -1 +0,0 @@ -{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"IsReferencedBy","subRelType":null,"relClass":"datacite","source":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","target":"60|97519e00ee2cddfa1f5bcb5220429b8f","collectedfrom":[{"key":"dli_________::europe_pmc__","value":"Europe PMC","dataInfo":null}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/summary.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/summary.json deleted file mode 100644 index d9b7c43719..0000000000 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/summary.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","localIdentifier":[{"id":"16909284","type":"pbmid"},{"id":"10.1007/s00438-006-0155-3","type":"doi"}],"typology":"publication","title":["Effects of the Sabin-like mutations in domain V of the internal ribosome entry segment on translational efficiency of the Coxsackievirus B3.","Effects of the Sabin-like mutations in domain V of the internal ribosome entry segment on translational efficiency of the Coxsackievirus B3"],"author":["Ben M’hadheb-Gharbi Manel","Gharbi Jawhar","Paulous Sylvie","Brocard Michèle","Komaromva Anastasia","Aouni Mahjoub","M. Kean Katherine"],"date":[null,"2018-11-13","2006-08-14T15:43:22Z"],"subject":[],"publisher":null,"relatedPublications":1,"relatedDatasets":4,"relatedUnknown":0,"datasources":null,"abstract":"The domain V within the internal ribosome entry segment (IRES) of poliovirus (PV) is expected to be important in its own neurovirulence because it contains an attenuating mutation in each of the Sabin vaccine strains. In this study, we try to find out if the results observed in the case of Sabin vaccine strains of PV can be extrapolated to another virus belonging to the same genus of enteroviruses but with a different tropism. To test this hypothesis, we used the coxsackievirus B3 (CVB3), known to be the mo"}