deleted old scholix project

pull/124/head
Sandro La Bruzzo 3 years ago
parent 8535506c22
commit ed684874f2

@ -1,82 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the dhp-dedup-scholexplorer module, a child of the dhp-workflows parent POM. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-scholexplorer</artifactId>
<build>
<plugins>
<!-- Scala compilation: add-source and compile are bound to the initialize phase,
     testCompile to process-test-resources. The Scala version comes from the
     scala.version property, which is not defined in this file. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<!-- Most dependencies below declare no version here; presumably the versions are
     managed by the parent POM (dependencyManagement) - verify against dhp-workflows. -->
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<!-- Sibling module, pinned to the same version as this project. -->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
</dependencies>
</project>

@ -1,121 +0,0 @@
package eu.dnetlib.dedup;
import static java.util.Collections.reverseOrder;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import static org.apache.commons.lang.StringUtils.endsWith;
import static org.apache.commons.lang.StringUtils.substringBefore;
import java.time.Year;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.dhp.schema.oaf.Field;
/**
 * Selects one representative acceptance date from the dates found in a group of records.
 *
 * <p>Only values shaped like {@code yyyy-MM-dd} whose year lies between {@code YEAR_LB} and
 * {@code YEAR_UB} take part in the selection. Among equally ranked candidates, a date that does
 * not end in {@code 01-01} is preferred over one that does.
 */
public class DatePicker {

	/** Textual shape a date must have: four-digit year, two-digit month, two-digit day. */
	private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";

	/** Month-day suffix that is passed over whenever a better candidate exists. */
	private static final String DATE_DEFAULT_SUFFIX = "01-01";

	/** Lowest accepted year (inclusive). */
	private static final int YEAR_LB = 1300;

	/** Highest accepted year (inclusive): five years after the year in which this class was loaded. */
	private static final int YEAR_UB = Year.now().getValue() + 5;

	/**
	 * Picks the date to assign to a merged record.
	 *
	 * @param dateofacceptance the candidate dates; blank and null entries are ignored, a null
	 *        collection is treated like an empty one
	 * @return a field carrying the chosen date; an empty field when there is no non-blank
	 *         candidate. When no candidate passes validation, an arbitrary non-blank input
	 *         value is returned unchanged.
	 */
	public static Field<String> pick(final Collection<String> dateofacceptance) {
		if (dateofacceptance == null) {
			return new Field<>();
		}

		final Map<String, Integer> frequencies = dateofacceptance
			.parallelStream()
			.filter(StringUtils::isNotBlank)
			.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));

		if (frequencies.isEmpty()) {
			return new Field<>();
		}

		final Field<String> date = new Field<>();
		// provisional answer: only returned when no value survives validation
		date.setValue(frequencies.keySet().iterator().next());

		// valid dates only, most frequent first
		final Map<String, Integer> sorted = frequencies
			.entrySet()
			.stream()
			.filter(d -> StringUtils.isNotBlank(d.getKey()))
			.filter(d -> d.getKey().matches(DATE_PATTERN))
			.filter(d -> inRange(d.getKey()))
			.sorted(reverseOrder(comparingByValue()))
			.collect(
				toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));

		if (sorted.isEmpty()) {
			return date;
		}

		final String mostFrequent = sorted.keySet().iterator().next();

		if (sorted.size() >= 3) {
			// voting method: a date is accepted when it occurs at least (1/3 + 1) times
			final int acceptThreshold = (sorted.size() / 3) + 1;
			final List<String> accepted = sorted
				.entrySet()
				.stream()
				.filter(e -> e.getValue() >= acceptThreshold)
				.map(Map.Entry::getKey)
				.collect(Collectors.toList());

			if (accepted.isEmpty()) {
				// no strong majority: take a top-ranked date without the default suffix,
				// otherwise the most frequent one
				final int max = sorted.values().iterator().next();
				final Optional<String> first = sorted
					.entrySet()
					.stream()
					.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
					.map(Map.Entry::getKey)
					.findFirst();
				date.setValue(first.orElse(mostFrequent));
				return date;
			}

			// A single accepted date wins outright. Among several, the first one without the
			// default suffix wins; when they all carry it, the most frequent accepted date is
			// used instead of the unvalidated provisional value.
			date
				.setValue(
					accepted
						.stream()
						.filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
						.findFirst()
						.orElse(accepted.get(0)));
			return date;
		}

		if (sorted.size() == 2) {
			// the first date not ending in the default suffix is returned
			for (Map.Entry<String, Integer> e : sorted.entrySet()) {
				if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
					date.setValue(e.getKey());
					return date;
				}
			}
		}

		// none of the dates seems good enough, return the most frequent one
		date.setValue(mostFrequent);
		return date;
	}

	/**
	 * Tells whether the year of a date lies within the accepted bounds.
	 *
	 * @param date a string whose part before the first {@code -} is an integer year
	 * @return true when the year is between {@code YEAR_LB} and {@code YEAR_UB}, both inclusive
	 */
	private static boolean inRange(final String date) {
		final int year = Integer.parseInt(substringBefore(date, "-"));
		return year >= YEAR_LB && year <= YEAR_UB;
	}
}

@ -1,327 +0,0 @@
package eu.dnetlib.dedup;
import java.util.Collection;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
/**
 * Builds the representative ("dedup") record of every group of merged entities.
 *
 * <p>For each dedup id the JSON of the merged entities is parsed and folded into a fresh
 * entity of the requested type; the result gets trust {@code "0.9"} and a common timestamp.
 */
public class DedupRecordFactory {

	/** Trust value assigned to every generated dedup record. */
	private static final String DEDUP_TRUST = "0.9";

	/**
	 * Creates the dedup records for one entity type.
	 *
	 * @param sc the Java Spark context (not used by this method)
	 * @param spark the session used to read both inputs
	 * @param mergeRelsInputPath location of the relations; only those with relClass 'merges' are used
	 * @param entitiesInputPath location of the kryo-encoded entities
	 * @param entityType the type of record to produce
	 * @param dedupConf the dedup configuration, providing the JSON path of the entity id
	 * @return one merged record per dedup id, or null for an entity type without a merger
	 */
	public static JavaRDD<OafEntity> createDedupRecord(
		final JavaSparkContext sc,
		final SparkSession spark,
		final String mergeRelsInputPath,
		final String entitiesInputPath,
		final OafEntityType entityType,
		final DedupConfig dedupConf) {

		// the same timestamp is stamped on every record produced by this run
		final long ts = System.currentTimeMillis();

		// <id, json_entity>
		// NOTE(review): serialization uses the Jackson 1 (org.codehaus) mapper, created once per
		// record, while the mergers below parse with Jackson 2 - confirm before changing.
		final JavaPairRDD<String, String> inputJsonEntities = spark
			.read()
			.load(entitiesInputPath)
			.as(Encoders.kryo(Oaf.class))
			.map(
				(MapFunction<Oaf, String>) p -> new org.codehaus.jackson.map.ObjectMapper().writeValueAsString(p),
				Encoders.STRING())
			.javaRDD()
			.mapToPair(
				(PairFunction<String, String, String>) it -> new Tuple2<>(
					MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it));

		// <target, source> of every 'merges' relation: keyed by the id of the merged entity,
		// carrying the dedup id
		final JavaPairRDD<String, String> mergeRels = spark
			.read()
			.load(mergeRelsInputPath)
			.as(Encoders.bean(Relation.class))
			.where("relClass=='merges'")
			.javaRDD()
			.mapToPair(
				(PairFunction<Relation, String, String>) r -> new Tuple2<String, String>(r.getTarget(), r.getSource()));

		// <dedup_id, json_entity_merged>
		final JavaPairRDD<String, String> joinResult = mergeRels
			.join(inputJsonEntities)
			.mapToPair(
				(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);

		final JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();

		switch (entityType) {
			case publication:
				return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
			case dataset:
				return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
			case project:
				return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
			case software:
				return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
			case datasource:
				return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
			case organization:
				return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
			case otherresearchproduct:
				return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
			default:
				// kept for backward compatibility with the original contract
				return null;
		}
	}

	/** Creates a JSON mapper that ignores properties unknown to the target class. */
	private static ObjectMapper lenientMapper() {
		final ObjectMapper mapper = new ObjectMapper();
		mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
		return mapper;
	}

	/** Sets the dedup trust and the run timestamp on a merged record, creating its DataInfo when missing. */
	private static <T extends OafEntity> T stamp(final T entity, final long ts) {
		if (entity.getDataInfo() == null) {
			entity.setDataInfo(new DataInfo());
		}
		entity.getDataInfo().setTrust(DEDUP_TRUST);
		entity.setLastupdatetimestamp(ts);
		return entity;
	}

	/**
	 * Merges a group of publications; authors are merged pairwise and the acceptance date is
	 * chosen by {@link DatePicker}.
	 *
	 * @param e the dedup id paired with the JSON of the publications to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged publication
	 */
	private static DLIPublication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
		final DLIPublication p = new DLIPublication(); // the result of the merge
		p.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		final Collection<String> dateofacceptance = Lists.newArrayList();
		if (e._2() != null) {
			for (final String pub : e._2()) {
				try {
					final DLIPublication publication = mapper.readValue(pub, DLIPublication.class);
					p.mergeFrom(publication);
					p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
					// collect the date only when the record has one
					if (publication.getDateofacceptance() != null) {
						dateofacceptance.add(publication.getDateofacceptance().getValue());
					}
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		p.setDateofacceptance(DatePicker.pick(dateofacceptance));
		return stamp(p, ts);
	}

	/**
	 * Merges a group of datasets; authors are merged pairwise and the acceptance date is
	 * chosen by {@link DatePicker}.
	 *
	 * @param e the dedup id paired with the JSON of the datasets to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged dataset
	 */
	private static DLIDataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
		final DLIDataset d = new DLIDataset(); // the result of the merge
		d.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		final Collection<String> dateofacceptance = Lists.newArrayList();
		if (e._2() != null) {
			for (final String dat : e._2()) {
				try {
					// NOTE(review): records are parsed as Dataset although the merge target is
					// DLIDataset (publicationMerger parses DLIPublication) - confirm this
					// asymmetry is intended.
					final Dataset dataset = mapper.readValue(dat, Dataset.class);
					d.mergeFrom(dataset);
					d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
					// collect the date only when the record has one
					if (dataset.getDateofacceptance() != null) {
						dateofacceptance.add(dataset.getDateofacceptance().getValue());
					}
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		d.setDateofacceptance(DatePicker.pick(dateofacceptance));
		return stamp(d, ts);
	}

	/**
	 * Merges a group of projects.
	 *
	 * @param e the dedup id paired with the JSON of the projects to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged project
	 */
	private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
		final Project p = new Project(); // the result of the merge
		p.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		if (e._2() != null) {
			for (final String proj : e._2()) {
				try {
					p.mergeFrom(mapper.readValue(proj, Project.class));
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		return stamp(p, ts);
	}

	/**
	 * Merges a group of software records; authors are merged pairwise and the acceptance date
	 * is chosen by {@link DatePicker}.
	 *
	 * @param e the dedup id paired with the JSON of the software records to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged software record
	 */
	private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
		final Software s = new Software(); // the result of the merge
		s.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		final Collection<String> dateofacceptance = Lists.newArrayList();
		if (e._2() != null) {
			for (final String soft : e._2()) {
				try {
					final Software software = mapper.readValue(soft, Software.class);
					s.mergeFrom(software);
					s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
					// collect the date only when the record has one
					if (software.getDateofacceptance() != null) {
						dateofacceptance.add(software.getDateofacceptance().getValue());
					}
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		s.setDateofacceptance(DatePicker.pick(dateofacceptance));
		return stamp(s, ts);
	}

	/**
	 * Merges a group of datasources.
	 *
	 * @param e the dedup id paired with the JSON of the datasources to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged datasource
	 */
	private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
		final Datasource d = new Datasource(); // the result of the merge
		d.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		if (e._2() != null) {
			for (final String dat : e._2()) {
				try {
					d.mergeFrom(mapper.readValue(dat, Datasource.class));
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		return stamp(d, ts);
	}

	/**
	 * Merges a group of organizations.
	 *
	 * <p>The trust of the single organizations is not inspected: the merged record always gets
	 * the fixed dedup trust, so a member without DataInfo no longer makes the merge fail.
	 *
	 * @param e the dedup id paired with the JSON of the organizations to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged organization
	 */
	private static Organization organizationMerger(
		Tuple2<String, Iterable<String>> e, final long ts) {
		final Organization o = new Organization(); // the result of the merge
		o.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		if (e._2() != null) {
			for (final String org : e._2()) {
				try {
					o.mergeFrom(mapper.readValue(org, Organization.class));
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		return stamp(o, ts);
	}

	/**
	 * Merges a group of other research products; authors are merged pairwise and the
	 * acceptance date is chosen by {@link DatePicker}.
	 *
	 * @param e the dedup id paired with the JSON of the products to merge
	 * @param ts timestamp to set on the merged record
	 * @return the merged product
	 */
	private static OtherResearchProduct otherresearchproductMerger(
		Tuple2<String, Iterable<String>> e, final long ts) {
		final OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge
		o.setId(e._1());
		final ObjectMapper mapper = lenientMapper();
		final Collection<String> dateofacceptance = Lists.newArrayList();
		if (e._2() != null) {
			for (final String orp : e._2()) {
				try {
					final OtherResearchProduct otherResearchProduct = mapper
						.readValue(orp, OtherResearchProduct.class);
					o.mergeFrom(otherResearchProduct);
					o.setAuthor(DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
					// collect the date only when the record has one
					if (otherResearchProduct.getDateofacceptance() != null) {
						dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
					}
				} catch (Exception exc) {
					throw new RuntimeException(exc);
				}
			}
		}
		o.setDateofacceptance(DatePicker.pick(dateofacceptance));
		return stamp(o, ts);
	}
}

@ -1,239 +0,0 @@
package eu.dnetlib.dedup;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import scala.Tuple2;
/**
 * Static helpers shared by the dedup jobs: Spark accumulators, HDFS and classpath access,
 * output path conventions and author merging.
 */
public class DedupUtility {

	/** Minimum name similarity (exclusive) for two authors to be treated as the same person. */
	private static final Double THRESHOLD = 0.95;

	/**
	 * Registers the counters used while deduplicating one entity type.
	 *
	 * @param dedupConf the dedup configuration; its workflow settings are embedded in the counter names
	 * @param context the Spark context on which the accumulators are created
	 * @return the accumulators, keyed by their name
	 */
	public static Map<String, LongAccumulator> constructAccumulator(
		final DedupConfig dedupConf, final SparkContext context) {

		Map<String, LongAccumulator> accumulators = new HashMap<>();

		String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
		accumulators.put(acc1, context.longAccumulator(acc1));
		String acc2 = String
			.format(
				"%s::%s",
				dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
		accumulators.put(acc2, context.longAccumulator(acc2));
		String acc3 = String
			.format(
				"%s::%s",
				dedupConf.getWf().getEntityType(),
				String
					.format(
						"Skipped records for count(%s) >= %s",
						dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
		accumulators.put(acc3, context.longAccumulator(acc3));
		String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
		accumulators.put(acc4, context.longAccumulator(acc4));
		String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
		accumulators.put(acc5, context.longAccumulator(acc5));
		String acc6 = String
			.format(
				"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
		accumulators.put(acc6, context.longAccumulator(acc6));

		return accumulators;
	}

	/**
	 * Reads a text file as an RDD of lines.
	 *
	 * @param path the location of the text file
	 * @param context the Spark context used for reading
	 * @return the lines of the file
	 */
	public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
		return context.textFile(path);
	}

	/**
	 * Recursively deletes a path on the default file system, when it exists.
	 *
	 * @param path the path to remove
	 * @throws IOException when the file system cannot be accessed
	 */
	public static void deleteIfExists(String path) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fileSystem = FileSystem.get(conf);
		if (fileSystem.exists(new Path(path))) {
			fileSystem.delete(new Path(path), true);
		}
	}

	/**
	 * Loads a dedup configuration stored as UTF-8 text on the default file system.
	 *
	 * @param path the location of the configuration
	 * @return the parsed configuration
	 * @throws IOException when the file cannot be read
	 */
	public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fileSystem = FileSystem.get(conf);
		// the stream is closed as soon as its content has been read
		try (FSDataInputStream inputStream = fileSystem.open(new Path(path))) {
			return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
		}
	}

	/**
	 * Reads a classpath resource into a string, decoding it as UTF-8.
	 *
	 * @param filename the resource name, resolved relative to {@code clazz}
	 * @param clazz the class whose class loader locates the resource
	 * @return the content of the resource
	 * @throws RuntimeException when the resource is missing or cannot be read
	 */
	static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
		final StringWriter sw = new StringWriter();
		try (java.io.InputStream in = clazz.getResourceAsStream(filename)) {
			if (in == null) {
				throw new RuntimeException("cannot load resource from classpath: " + filename);
			}
			IOUtils.copy(in, sw, StandardCharsets.UTF_8.name());
			return sw.toString();
		} catch (final IOException e) {
			// keep the cause so the underlying I/O problem stays visible
			throw new RuntimeException("cannot load resource from classpath: " + filename, e);
		}
	}

	/**
	 * Computes the distinct clustering keys of a document.
	 *
	 * @param conf the dedup configuration
	 * @param doc the document to cluster
	 * @return the set of keys produced by the clustering combiner
	 */
	static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
		return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
	}

	/**
	 * Computes the hex-encoded MD5 digest of the UTF-8 bytes of a string.
	 *
	 * @param s the text to hash
	 * @return the digest, or null when it cannot be computed (a message is printed on stderr)
	 */
	public static String md5(final String s) {
		try {
			final MessageDigest md = MessageDigest.getInstance("MD5");
			md.update(s.getBytes(StandardCharsets.UTF_8));
			return new String(Hex.encodeHex(md.digest()));
		} catch (final Exception e) {
			System.err.println("Error creating id");
			return null;
		}
	}

	/**
	 * Merges two author lists: the list with more pid-carrying authors (or, on a tie, the
	 * strictly longer one, else {@code b}) is kept and enriched in place with the pids of the
	 * other list.
	 *
	 * @param a the first list, may be null
	 * @param b the second list, may be null
	 * @return the list chosen as base, the very same instance received as argument
	 */
	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
		final int pa = countAuthorsPids(a);
		final int pb = countAuthorsPids(b);
		final boolean aIsBase = pa == pb ? authorsSize(a) > authorsSize(b) : pa > pb;
		final List<Author> base = aIsBase ? a : b;
		final List<Author> enrich = aIsBase ? b : a;
		enrichPidFromList(base, enrich);
		return base;
	}

	/**
	 * Copies onto the authors of {@code base} the pids of {@code enrich} that {@code base}
	 * does not have yet; each pid goes to the most similar base author, provided the
	 * similarity exceeds {@code THRESHOLD}.
	 */
	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
		if (base == null || enrich == null)
			return;

		// pids already present in base, by comparable string
		final Map<String, Author> basePidAuthorMap = base
			.stream()
			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
			.flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a)))
			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));

		// <pid, owner> pairs of enrich whose pid is unknown to base
		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
			.stream()
			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
			.flatMap(
				a -> a
					.getPid()
					.stream()
					.filter(p -> !basePidAuthorMap.containsKey(p.toComparableString()))
					.map(p -> new Tuple2<>(p, a)))
			.collect(Collectors.toList());

		pidToEnrich
			.forEach(
				candidate -> {
					final Optional<Tuple2<Double, Author>> best = base
						.stream()
						.map(ba -> new Tuple2<>(sim(ba, candidate._2()), ba))
						.max(Comparator.comparing(Tuple2::_1));
					if (best.isPresent() && best.get()._1() > THRESHOLD) {
						final Author target = best.get()._2();
						// a base author may have no pid list yet
						if (target.getPid() == null) {
							target.setPid(new ArrayList<>());
						}
						target.getPid().add(candidate._1());
					}
				});
	}

	/**
	 * @param basePath the root path
	 * @param entityType the entity type
	 * @return {@code basePath/entityType}
	 */
	public static String createEntityPath(final String basePath, final String entityType) {
		return String.format("%s/%s", basePath, entityType);
	}

	/**
	 * @param basePath the root path
	 * @param entityType the entity type
	 * @return {@code basePath/entityType/simRel}
	 */
	public static String createSimRelPath(final String basePath, final String entityType) {
		return String.format("%s/%s/simRel", basePath, entityType);
	}

	/**
	 * @param basePath the root path
	 * @param entityType the entity type
	 * @return {@code basePath/entityType/mergeRel}
	 */
	public static String createMergeRelPath(final String basePath, final String entityType) {
		return String.format("%s/%s/mergeRel", basePath, entityType);
	}

	/**
	 * Jaro-Winkler similarity of two authors: on the surnames when both names are flagged as
	 * accurate by the parser, on the normalised full names otherwise.
	 */
	private static Double sim(Author a, Author b) {
		final Person pa = parse(a);
		final Person pb = parse(b);
		if (pa.isAccurate() && pb.isAccurate()) {
			return new JaroWinkler()
				.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
		} else {
			return new JaroWinkler()
				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
		}
	}

	/** Lower-cases an NFD-decomposed string and blanks out non-word characters, marks, punctuation, digits and newlines. */
	private static String normalize(final String s) {
		return nfd(s)
			.toLowerCase()
			// do not compact the regexes in a single expression, would cause StackOverflowError
			// in case of large input strings
			.replaceAll("(\\W)+", " ")
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
			.replaceAll("(\\p{Punct})+", " ")
			.replaceAll("(\\d)+", " ")
			.replaceAll("(\\n)+", " ")
			.trim();
	}

	/** Applies Unicode canonical decomposition (NFD). */
	private static String nfd(final String s) {
		return Normalizer.normalize(s, Normalizer.Form.NFD);
	}

	/** Builds a Person from "surname, name" when the surname is available, from the full name otherwise. */
	private static Person parse(Author author) {
		if (StringUtils.isNotBlank(author.getSurname())) {
			return new Person(author.getSurname() + ", " + author.getName(), false);
		} else {
			return new Person(author.getFullname(), false);
		}
	}

	/** Counts the authors having at least one pid with a non-blank value; 0 for a null list. */
	private static int countAuthorsPids(List<Author> authors) {
		if (authors == null)
			return 0;
		return (int) authors.stream().filter(DedupUtility::hasPid).count();
	}

	/** Size of the list; 0 for a null list. */
	private static int authorsSize(List<Author> authors) {
		if (authors == null)
			return 0;
		return authors.size();
	}

	/** Tells whether the author has at least one non-null pid with a non-blank value. */
	private static boolean hasPid(Author a) {
		if (a == null || a.getPid() == null || a.getPid().size() == 0)
			return false;
		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
	}
}

@ -1,182 +0,0 @@
package eu.dnetlib.dedup;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Serializable;
import scala.Tuple2;
/**
 * Spark pipeline steps of the deduplication: JSON entities are turned into documents,
 * documents are clustered into blocks and the documents of each block are compared to
 * produce relations.
 */
public class Deduper implements Serializable {

	private static final Log log = LogFactory.getLog(Deduper.class);

	/**
	 * Runs the whole pipeline: vertexes, blocks, relations.
	 *
	 * @param context the spark context
	 * @param entities list of JSON entities to be deduped
	 * @param config the dedup configuration
	 * @return the list of relations generated by the deduplication
	 */
	public static JavaPairRDD<String, String> dedup(
		JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {

		// create vertexes of the graph: <ID, MapDocument>
		JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);

		// create blocks for deduplication
		JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);

		// create relations by comparing only elements in the same group
		return computeRelations(context, blocks, config);
	}

	/**
	 * Compares the documents of every block and collects the resulting relations, each pair
	 * being reported once.
	 *
	 * @param context the spark context
	 * @param blocks list of blocks
	 * @param config the dedup configuration
	 * @return the list of relations generated by the deduplication
	 */
	public static JavaPairRDD<String, String> computeRelations(
		JavaSparkContext context,
		JavaPairRDD<String, Iterable<MapDocument>> blocks,
		DedupConfig config) {

		Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		return distinctRelations(
			blocks
				.flatMapToPair(
					(PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
						final SparkReporter reporter = new SparkReporter(accumulators);
						new BlockProcessor(config).process(it._1(), it._2(), reporter);
						return reporter.getRelations().iterator();
					}));
	}

	/**
	 * Keeps a single occurrence of every relation; two relations are the same when the
	 * concatenation of their two ids is equal.
	 */
	private static JavaPairRDD<String, String> distinctRelations(
		final JavaPairRDD<String, String> relations) {
		return relations
			.mapToPair(
				(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
					item._1() + item._2(), item))
			.reduceByKey((a, b) -> a)
			.mapToPair(
				(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
	}

	/**
	 * Groups the documents by clustering key.
	 *
	 * @param context the spark context
	 * @param mapDocs list of entities: &lt;id, entity&gt;
	 * @param config the dedup configuration
	 * @return the list of blocks based on clustering of dedup configuration
	 */
	public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(
		JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
		return mapDocs
			// the reduce is just to be sure that we haven't document with same id
			.reduceByKey((a, b) -> a)
			.map(Tuple2::_2)
			// Clustering: from <id, doc> to List<groupkey,doc>
			.flatMapToPair(
				(PairFlatMapFunction<MapDocument, String, MapDocument>) a -> DedupUtility
					.getGroupingKeys(config, a)
					.stream()
					.map(it -> new Tuple2<>(it, a))
					.collect(Collectors.toList())
					.iterator())
			.groupByKey();
	}

	/**
	 * Groups the documents by clustering key, keeping each block sorted by the configured
	 * order field and truncated to the configured maximum group size.
	 *
	 * @param context the spark context
	 * @param mapDocs list of entities: &lt;id, entity&gt;
	 * @param config the dedup configuration, providing order field and maximum group size
	 * @return the sorted, size-bounded blocks
	 */
	public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(
		JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
		final String of = config.getWf().getOrderField();
		final int maxQueueSize = config.getWf().getGroupMaxSize();
		return mapDocs
			// the reduce is just to be sure that we haven't document with same id
			.reduceByKey((a, b) -> a)
			.map(Tuple2::_2)
			// Clustering: from <id, doc> to List<groupkey,doc>
			.flatMapToPair(
				(PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a -> DedupUtility
					.getGroupingKeys(config, a)
					.stream()
					.map(
						it -> {
							List<MapDocument> tmp = new ArrayList<>();
							tmp.add(a);
							return new Tuple2<>(it, tmp);
						})
					.collect(Collectors.toList())
					.iterator())
			.reduceByKey(
				(Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
					v1.addAll(v2);
					v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
					// keep at most maxQueueSize documents per block
					if (v1.size() > maxQueueSize)
						return new ArrayList<>(v1.subList(0, maxQueueSize));
					return v1;
				});
	}

	/**
	 * Parses every JSON entity into a document keyed by its identifier.
	 *
	 * @param context the spark context
	 * @param entities list of JSON entities
	 * @param config the dedup configuration
	 * @return the list of vertexes: &lt;id, mapDocument&gt;
	 */
	public static JavaPairRDD<String, MapDocument> mapToVertexes(
		JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
		return entities
			.mapToPair(
				(PairFunction<String, String, MapDocument>) s -> {
					MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
					return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
				});
	}

	/**
	 * Variant of {@link #computeRelations} working on sorted blocks.
	 *
	 * @param context the spark context
	 * @param blocks list of sorted blocks
	 * @param config the dedup configuration
	 * @return the list of relations generated by the deduplication
	 * @throws RuntimeException wrapping any failure raised while processing a block; the
	 *         message is the identifier of the first document of the block, or the block key
	 *         when the block is empty
	 */
	public static JavaPairRDD<String, String> computeRelations2(
		JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
		Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		return distinctRelations(
			blocks
				.flatMapToPair(
					(PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
						try {
							final SparkReporter reporter = new SparkReporter(accumulators);
							new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
							return reporter.getRelations().iterator();
						} catch (Exception e) {
							// building the message must not hide the original failure
							final String culprit = it._2() == null || it._2().isEmpty()
								? it._1()
								: it._2().get(0).getIdentifier();
							throw new RuntimeException(culprit, e);
						}
					}));
	}
}

@ -1,6 +0,0 @@
package eu.dnetlib.dedup;
/**
 * The entity types handled by the dedup jobs. The constant names are lower case because
 * they are resolved from plain strings through {@code valueOf}.
 */
public enum OafEntityType {
	datasource,
	organization,
	project,
	dataset,
	otherresearchproduct,
	software,
	publication
}

@ -1,112 +0,0 @@
package eu.dnetlib.dedup;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.hash.Hashing;
import eu.dnetlib.dedup.graph.ConnectedComponent;
import eu.dnetlib.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
/**
 * Spark job that turns the similarity relations of one entity type into connected
 * components and stores, for every component with more than one member, the
 * merges / isMergedIn relations between the component id and its members.
 */
public class SparkCreateConnectedComponent {

	/** Shared JSON mapper: creating one per record inside the map function is needlessly expensive. */
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments, described by the dedup_parameters.json resource;
	 *        the values read here are master, sourcePath, entity, targetPath and dedupConf
	 * @throws Exception when argument parsing or the Spark job fails
	 */
	public static void main(String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkCreateConnectedComponent.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
		parser.parseArgument(args);
		final SparkSession spark = SparkSession
			.builder()
			.appName(SparkCreateConnectedComponent.class.getSimpleName())
			.master(parser.get("master"))
			.getOrCreate();

		final String inputPath = parser.get("sourcePath");
		final String entity = parser.get("entity");
		final String targetPath = parser.get("targetPath");
		final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

		// vertexes: <hash of the entity id, entity id>
		final JavaPairRDD<Object, String> vertexes = spark
			.read()
			.load(inputPath + "/" + entity)
			.as(Encoders.kryo(Oaf.class))
			.map((MapFunction<Oaf, String>) p -> OBJECT_MAPPER.writeValueAsString(p), Encoders.STRING())
			.javaRDD()
			.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
			.mapToPair(
				(PairFunction<String, Object, String>) s -> new Tuple2<Object, String>(getHashcode(s), s));

		final Dataset<Relation> similarityRelations = spark
			.read()
			.load(DedupUtility.createSimRelPath(targetPath, entity))
			.as(Encoders.bean(Relation.class));

		// edges: one per similarity relation, between the hashes of its two ids
		final RDD<Edge<String>> edgeRdd = similarityRelations
			.javaRDD()
			.map(
				it -> new Edge<>(
					getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass()))
			.rdd();

		final JavaRDD<ConnectedComponent> cc = GraphProcessor
			.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
			.toJavaRDD();

		// components with a single member produce no relation
		final Dataset<Relation> mergeRelation = spark
			.createDataset(
				cc
					.filter(k -> k.getDocIds().size() > 1)
					.flatMap(
						(FlatMapFunction<ConnectedComponent, Relation>) c -> toMergeRelations(c).iterator())
					.rdd(),
				Encoders.bean(Relation.class));

		mergeRelation
			.write()
			.mode("overwrite")
			.save(DedupUtility.createMergeRelPath(targetPath, entity));
	}

	/**
	 * Builds, for every member of a component, the pair of relations linking it to the
	 * component: component merges member, member isMergedIn component.
	 */
	private static List<Relation> toMergeRelations(final ConnectedComponent c) {
		final List<Relation> relations = new ArrayList<>();
		for (final String id : c.getDocIds()) {
			relations.add(relation(c.getCcId(), id, ModelConstants.MERGES));
			relations.add(relation(id, c.getCcId(), ModelConstants.IS_MERGED_IN));
		}
		return relations;
	}

	/** Creates a relation with the given source, target and relation class. */
	private static Relation relation(final String source, final String target, final String relClass) {
		final Relation r = new Relation();
		r.setSource(source);
		r.setTarget(target);
		r.setRelClass(relClass);
		return r;
	}

	/**
	 * Maps an identifier to the long used as vertex id in the graph.
	 *
	 * @param id the entity identifier
	 * @return the long value of the murmur3 128-bit hash of the identifier
	 */
	public static long getHashcode(final String id) {
		return Hashing.murmur3_128().hashString(id).asLong();
	}
}

@ -1,59 +0,0 @@
package eu.dnetlib.dedup;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.pace.config.DedupConfig;
/**
 * Spark job that builds the dedup records of one entity type and stores them, Kryo-encoded,
 * under {@code <dedupPath>/<entity>/dedup_records}.
 * <p>
 * Parameters read from the command line: master, sourcePath, entity, dedupPath, dedupConf.
 */
public class SparkCreateDedupRecord {

    public static void main(String[] args) throws Exception {
        // the accepted arguments are described by a JSON resource bundled with the job
        final String parametersJson = IOUtils
            .toString(
                SparkCreateDedupRecord.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"));
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parametersJson);
        parser.parseArgument(args);

        final SparkSession spark = SparkSession
            .builder()
            .appName(SparkCreateDedupRecord.class.getSimpleName())
            .master(parser.get("master"))
            .getOrCreate();
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        final String sourcePath = parser.get("sourcePath");
        final String entity = parser.get("entity");
        final String dedupPath = parser.get("dedupPath");
        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

        // inputs: the merge relations of the entity and the entities themselves
        final String mergeRelPath = DedupUtility.createMergeRelPath(dedupPath, entity);
        final String entityPath = DedupUtility.createEntityPath(sourcePath, entity);

        final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory
            .createDedupRecord(
                sc,
                spark,
                mergeRelPath,
                entityPath,
                OafEntityType.valueOf(entity),
                dedupConf);

        final String outputPath = dedupPath + "/" + entity + "/dedup_records";
        spark
            .createDataset(dedupRecord.rdd(), Encoders.kryo(OafEntity.class))
            .write()
            .mode(SaveMode.Overwrite)
            .save(outputPath);
    }
}

@ -1,92 +0,0 @@
package eu.dnetlib.dedup;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
/**
 * Spark job that creates the similarity relations between the entities of one type and saves
 * them under the path returned by {@code DedupUtility.createSimRelPath(targetPath, entity)}.
 * <p>
 * Parameters read from the command line: master, sourcePath, entity, targetPath, dedupConf.
 */
public class SparkCreateSimRels {

    /** JSON serializer shared by all records: built once per JVM rather than once per entity. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkCreateSimRels.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
        parser.parseArgument(args);

        final SparkSession spark = SparkSession
            .builder()
            .appName(SparkCreateSimRels.class.getSimpleName())
            .master(parser.get("master"))
            .getOrCreate();
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        final String inputPath = parser.get("sourcePath");
        final String entity = parser.get("entity");
        final String targetPath = parser.get("targetPath");
        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

        // each entity is serialized to JSON and then parsed into a MapDocument keyed by its identifier
        JavaPairRDD<String, MapDocument> mapDocument = spark
            .read()
            .load(inputPath + "/" + entity)
            .as(Encoders.kryo(Oaf.class))
            .map((MapFunction<Oaf, String>) p -> OBJECT_MAPPER.writeValueAsString(p), Encoders.STRING())
            .javaRDD()
            .repartition(1000)
            .mapToPair(
                s -> {
                    MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
                    return new Tuple2<>(d.getIdentifier(), d);
                });

        // create blocks for deduplication
        JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);

        // create relations by comparing only elements in the same group
        final JavaPairRDD<String, String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);

        // every (source, target) pair becomes an "isSimilarTo" relation
        final JavaRDD<Relation> isSimilarToRDD = dedupRels
            .map(
                simRel -> {
                    final Relation r = new Relation();
                    r.setSource(simRel._1());
                    r.setTarget(simRel._2());
                    r.setRelClass("isSimilarTo");
                    return r;
                });

        spark
            .createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class))
            .write()
            .mode("overwrite")
            .save(DedupUtility.createSimRelPath(targetPath, entity));
    }
}

@ -1,52 +0,0 @@
package eu.dnetlib.dedup;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.pace.util.Reporter;
import scala.Serializable;
import scala.Tuple2;
/**
 * {@link Reporter} that forwards counter increments to Spark accumulators and collects the
 * emitted (from, to) pairs in memory.
 */
public class SparkReporter implements Serializable, Reporter {

    private static final Log log = LogFactory.getLog(SparkReporter.class);

    final List<Tuple2<String, String>> relations = new ArrayList<>();

    Map<String, LongAccumulator> accumulators;

    /**
     * @param accumulators the accumulators to update, keyed by {@code "<group>::<name>"}
     */
    public SparkReporter(Map<String, LongAccumulator> accumulators) {
        this.accumulators = accumulators;
    }

    /**
     * Adds {@code delta} to the accumulator registered as {@code "<counterGroup>::<counterName>"}
     * in the given map; does nothing when no such key is registered.
     */
    public void incrementCounter(
        String counterGroup,
        String counterName,
        long delta,
        Map<String, LongAccumulator> accumulators) {
        final String key = counterGroup + "::" + counterName;
        if (!accumulators.containsKey(key)) {
            return;
        }
        accumulators.get(key).add(delta);
    }

    /** Same as the four-argument variant, using the accumulators given at construction time. */
    @Override
    public void incrementCounter(String counterGroup, String counterName, long delta) {
        incrementCounter(counterGroup, counterName, delta, accumulators);
    }

    /** Records the pair (from, to); the {@code type} argument is not used. */
    @Override
    public void emit(String type, String from, String to) {
        final Tuple2<String, String> pair = new Tuple2<>(from, to);
        relations.add(pair);
    }

    /** @return the pairs collected so far by {@link #emit(String, String, String)} */
    public List<Tuple2<String, String>> getRelations() {
        return relations;
    }
}

@ -1,84 +0,0 @@
package eu.dnetlib.dedup.graph;
import java.io.IOException;
import java.io.Serializable;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
public class ConnectedComponent implements Serializable {
private Set<String> docIds;
private String ccId;
public ConnectedComponent() {
}
public ConnectedComponent(Set<String> docIds) {
this.docIds = docIds;
createID();
}
public String createID() {
if (docIds.size() > 1) {
final String s = getMin();
String prefix = s.split("\\|")[0];
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s);
return ccId;
} else {
return docIds.iterator().next();
}
}
@JsonIgnore
public String getMin() {
final StringBuilder min = new StringBuilder();
docIds
.forEach(
i -> {
if (StringUtils.isBlank(min.toString())) {
min.append(i);
} else {
if (min.toString().compareTo(i) > 0) {
min.setLength(0);
min.append(i);
}
}
});
return min.toString();
}
@Override
public String toString() {
ObjectMapper mapper = new ObjectMapper();
try {
return mapper.writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Failed to create Json: ", e);
}
}
public Set<String> getDocIds() {
return docIds;
}
public void setDocIds(Set<String> docIds) {
this.docIds = docIds;
}
public String getCcId() {
return ccId;
}
public void setCcId(String ccId) {
this.ccId = ccId;
}
}

@ -1,37 +0,0 @@
package eu.dnetlib.dedup.graph
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import scala.collection.JavaConversions;
/** Groups vertexes into connected components using GraphX. */
object GraphProcessor {

  /**
   * Builds a graph from the given vertexes and edges, runs the connected-components algorithm
   * for at most `maxIterations` iterations and collects, for every component, the string
   * identifiers attached to its vertexes.
   *
   * A vertex for which the join yields no component is keyed by its own vertex id.
   */
  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
    val componentByVertex = graph.connectedComponents(maxIterations).vertices

    // re-key every identifier by the component its vertex belongs to
    val idsByComponent = vertexes.leftOuterJoin(componentByVertex).map {
      case (vertexId, (openaireId, componentId)) =>
        (componentId.getOrElse(vertexId), openaireId)
    }

    idsByComponent
      .groupByKey()
      .map[ConnectedComponent](group => asConnectedComponent(group))
  }

  /** Wraps the identifiers of one group (duplicates removed) into a ConnectedComponent. */
  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
    val docIds = group._2.toSet[String]
    new ConnectedComponent(JavaConversions.setAsJavaSet[String](docIds))
  }
}

@ -1,78 +0,0 @@
package eu.dnetlib.dedup.sx;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils;
import scala.Tuple2;
/**
 * Spark job that rewrites the endpoints of the relations according to the merge relations:
 * when the source (respectively the target) of a relation equals the target of a 'merges'
 * relation, it is replaced by the source of that merge relation.
 * <p>
 * Parameters read from the command line: master, relationPath, mergeRelPath, targetRelPath.
 * The result is written, Kryo-encoded, to targetRelPath.
 */
public class SparkPropagateRelationsJob {

    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkPropagateRelationsJob.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json")));
        parser.parseArgument(args);

        // the application is named after this job (it used to carry the name of SparkUpdateEntityJob)
        final SparkSession spark = SparkSession
            .builder()
            .appName(SparkPropagateRelationsJob.class.getSimpleName())
            .master(parser.get("master"))
            .getOrCreate();

        final String relationPath = parser.get("relationPath");
        final String mergeRelPath = parser.get("mergeRelPath");
        final String targetRelPath = parser.get("targetRelPath");

        // only the 'merges' direction of the merge relations is needed
        final Dataset<Relation> merge = spark
            .read()
            .load(mergeRelPath)
            .as(Encoders.bean(Relation.class))
            .where("relClass == 'merges'");

        // relations are stored Kryo-encoded; re-encode them as beans so their columns can be joined on
        final Dataset<Relation> rels = spark
            .read()
            .load(relationPath)
            .as(Encoders.kryo(Relation.class))
            .map(
                (MapFunction<Relation, Relation>) r -> r,
                Encoders.bean(Relation.class));

        // first pass: replace the source, and give a default DataInfo to relations lacking one
        final Dataset<Relation> firstJoin = rels
            .joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer")
            .map(
                (MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
                    final Relation mergeRelation = r._2();
                    final Relation relation = r._1();
                    if (mergeRelation != null) {
                        relation.setSource(mergeRelation.getSource());
                    }
                    if (relation.getDataInfo() == null) {
                        relation.setDataInfo(OafUtils.generateDataInfo("0.9", false));
                    }
                    return relation;
                },
                Encoders.bean(Relation.class));

        // second pass: replace the target
        final Dataset<Relation> secondJoin = firstJoin
            .joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer")
            .map(
                (MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
                    final Relation mergeRelation = r._2();
                    final Relation relation = r._1();
                    if (mergeRelation != null) {
                        relation.setTarget(mergeRelation.getSource());
                    }
                    return relation;
                },
                Encoders.kryo(Relation.class));

        secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath);
    }
}

@ -1,102 +0,0 @@
package eu.dnetlib.dedup.sx;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
 * Spark job that marks as deleted by inference the entities that are the target of a 'merges'
 * relation and writes them, together with the dedup records, as gzip-compressed text.
 * <p>
 * Parameters read from the command line: master, entityPath, mergeRelPath, dedupRecordPath,
 * entity (publication, dataset or unknown), targetPath.
 */
public class SparkUpdateEntityJob {

    static final String IDJSONPATH = "$.id";

    /** Shared JSON mapper, configured once instead of being rebuilt for every record. */
    private static final ObjectMapper MAPPER = new ObjectMapper()
        .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkUpdateEntityJob.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")));
        parser.parseArgument(args);

        final SparkSession spark = SparkSession
            .builder()
            .appName(SparkUpdateEntityJob.class.getSimpleName())
            .master(parser.get("master"))
            .getOrCreate();
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        final String entityPath = parser.get("entityPath");
        final String mergeRelPath = parser.get("mergeRelPath");
        final String dedupRecordPath = parser.get("dedupRecordPath");
        final String entity = parser.get("entity");
        final String destination = parser.get("targetPath");

        // ids of the merged entities, i.e. the distinct targets of the 'merges' relations
        final Dataset<Relation> df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
        final JavaPairRDD<String, String> mergedIds = df
            .where("relClass == 'merges'")
            .select(df.col("target"))
            .distinct()
            .toJavaRDD()
            .mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));

        final JavaRDD<String> sourceEntity = sc.textFile(entityPath);
        final JavaRDD<String> dedupEntity = sc.textFile(dedupRecordPath);

        // key every JSON entity by the value found at IDJSONPATH
        JavaPairRDD<String, String> entitiesWithId = sourceEntity
            .mapToPair(
                (PairFunction<String, String, String>) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s));

        final Class<? extends Oaf> mainClass = entityClass(entity);

        // entities with a matching merged id are flagged, the others pass through unchanged
        JavaRDD<String> map = entitiesWithId
            .leftOuterJoin(mergedIds)
            .map(
                k -> k._2()._2().isPresent()
                    ? updateDeletedByInference(k._2()._1(), mainClass)
                    : k._2()._1());

        map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class);
    }

    /**
     * Resolves the model class used to parse the entities of the given type.
     *
     * @throws IllegalArgumentException if the type is not publication, dataset or unknown
     */
    private static Class<? extends Oaf> entityClass(final String entity) {
        switch (entity) {
            case "publication":
                return DLIPublication.class;
            case "dataset":
                return DLIDataset.class;
            case "unknown":
                return DLIUnknown.class;
            default:
                throw new IllegalArgumentException("Illegal type " + entity);
        }
    }

    /**
     * Parses the JSON as an instance of {@code clazz}, sets its deletedbyinference flag (creating
     * the DataInfo when missing) and serializes it back to JSON.
     *
     * @throws RuntimeException if the JSON cannot be read or written
     */
    private static <T extends Oaf> String updateDeletedByInference(
        final String json, final Class<T> clazz) {
        try {
            Oaf entity = MAPPER.readValue(json, clazz);
            if (entity.getDataInfo() == null) {
                entity.setDataInfo(new DataInfo());
            }
            entity.getDataInfo().setDeletedbyinference(true);
            return MAPPER.writeValueAsString(entity);
        } catch (IOException e) {
            throw new RuntimeException("Unable to convert json", e);
        }
    }
}

@ -1,75 +0,0 @@
package eu.dnetlib.dedup.sx
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown, OafUtils}
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.LoggerFactory
import org.apache.spark.sql.functions.col
/**
 * Spark job that flags as deleted by inference the entities that are the target of a 'merges'
 * relation, appends the dedup records and writes the result to targetPath.
 *
 * Parameters used: master, entityPath, mergeRelPath, dedupRecordPath, targetPath.
 */
object SparkUpdateEntityWithDedupInfo {

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityWithDedupInfo.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")))
    val logger = LoggerFactory.getLogger(SparkUpdateEntityWithDedupInfo.getClass)
    parser.parseArgument(args)

    implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity]
    implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])
    implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
    implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]

    val spark: SparkSession = SparkSession
      .builder()
      .appName(SparkUpdateEntityWithDedupInfo.getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

    val entityPath = parser.get("entityPath")
    val mergeRelPath = parser.get("mergeRelPath")
    val dedupRecordPath = parser.get("dedupRecordPath")
    val destination = parser.get("targetPath")

    // log the parameters this job actually uses
    logger.info(s"entityPath = $entityPath")
    logger.info(s"mergeRelPath = $mergeRelPath")
    logger.info(s"dedupRecordPath = $dedupRecordPath")
    logger.info(s"targetPath = $destination")

    // ids of the merged entities: the targets of the 'merges' relations
    val mergedIds = spark.read.load(mergeRelPath).as[Relation]
      .where("relClass == 'merges'")
      .select(col("target"))

    // entities keyed by their id, so they can be joined with the merged ids
    val entities: Dataset[(String, OafEntity)] = spark
      .read
      .load(entityPath).as[OafEntity]
      .map(o => (o.getId, o))(Encoders.tuple(Encoders.STRING, oafEncoder))

    // left join: entities with a matching merged id are flagged, the others are kept as they are
    val finalDataset: Dataset[OafEntity] = entities.joinWith(mergedIds, entities("_1").equalTo(mergedIds("target")), "left")
      .map(k => {
        val e: OafEntity = k._1._2
        val t = k._2
        if (t != null && t.getString(0).nonEmpty) {
          if (e.getDataInfo == null) {
            e.setDataInfo(OafUtils.generateDataInfo())
          }
          e.getDataInfo.setDeletedbyinference(true)
        }
        e
      })

    val dedupRecords: Dataset[OafEntity] = spark.read.load(dedupRecordPath).as[OafEntity]

    finalDataset.union(dedupRecords)
      .repartition(1200).write
      .mode(SaveMode.Overwrite).save(destination)
  }
}

@ -1,33 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity to be deduped",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "dedupConf",
"paramDescription": "dedup configuration to be used",
"compressed": true,
"paramRequired": true
},
{
"paramName": "d",
"paramLongName": "dedupPath",
"paramDescription": "dedup path to load mergeRelation",
"paramRequired": true
}
]

@ -1,38 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "entityPath",
"paramDescription": "the input entity path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "dr",
"paramLongName": "dedupRecordPath",
"paramDescription": "the inputPath of dedup record",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the targetPath",
"paramRequired": true
}
]

@ -1,33 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity to be deduped",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "dedupConf",
"paramDescription": "dedup configuration to be used",
"compressed": true,
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "target path to save dedup result",
"paramRequired": true
}
]

@ -1,26 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "relationPath",
"paramDescription": "the input relation path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetRelPath",
"paramDescription": "the output Rel Path",
"paramRequired": true
}
]

@ -1,18 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -1,182 +0,0 @@
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<!-- Parameters declared by this workflow. None of them has a default value. -->
<!-- NOTE(review): the actions of this workflow also reference projectVersion, sparkExecutorCores and sparkExtraOPT, which are not declared in this section; presumably they are supplied by the job configuration. Confirm. -->
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>entity</name>
<description>the entity that should be processed</description>
</property>
<property>
<name>dedupConf</name>
<description>the dedup Configuration</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
</parameters>
<start to="DeleteWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteWorkingPath">
<!-- First step: deletes the entity folder under targetPath, then creates targetPath and the entity folder again before the Spark steps run. -->
<fs>
<delete path='${targetPath}/${entity}'/>
<mkdir path="${targetPath}"/>
<mkdir path="${targetPath}/${entity}"/>
</fs>
<ok to="CreateSimRels"/>
<error to="Kill"/>
</action>
<action name="CreateSimRels">
<!-- Runs SparkCreateSimRels on the entities of the given type, passing sourcePath, targetPath, entity and dedupConf; on success the workflow continues with CreateConnectedComponents. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateConnectedComponents"/>
<error to="Kill"/>
</action>
<action name="CreateConnectedComponents">
<!-- Runs SparkCreateConnectedComponent with the same arguments as CreateSimRels; on success the workflow continues with CreateDedupRecord. -->
<!-- NOTE(review): this step is expected to produce the mergeRel folder of the entity under targetPath, which the fixRelation and updateDeletedByInferenceEntity actions read. Confirm. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Connected Components</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<!-- Runs SparkCreateDedupRecord. The workflow targetPath is passed to the job as its dedupPath argument; on success the workflow continues with fixRelation. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--dedupPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="fixRelation"/>
<error to="Kill"/>
</action>
<action name="fixRelation">
<!-- Runs SparkPropagateRelationsJob: reads the mergeRel folder of the entity and the relation folder under sourcePath, and writes the result to the updated_relation folder of the entity under targetPath. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Propagate Dedup Relations</name>
<class>eu.dnetlib.dedup.sx.SparkPropagateRelationsJob</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
<arg>--targetRelPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>
</spark>
<ok to="updateDeletedByInferenceEntity"/>
<error to="Kill"/>
</action>
<action name="updateDeletedByInferenceEntity">
<!-- Runs SparkUpdateEntityWithDedupInfo: reads the entities under sourcePath, the mergeRel and dedup_records folders of the entity under targetPath, and writes the result to the updated_record folder. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Update ${entity} and add DedupRecord</name>
<class>eu.dnetlib.dedup.sx.SparkUpdateEntityWithDedupInfo</class>
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--entityPath</arg><arg>${sourcePath}/${entity}</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>
<arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_record</arg>
</spark>
<ok to="replaceEntity"/>
<error to="Kill"/>
</action>
<action name="replaceEntity">
<!-- Last step: deletes the entity and relation folders under sourcePath, then moves the updated_relation and updated_record folders produced by the previous actions into their place. -->
<fs>
<delete path='${sourcePath}/${entity}'/>
<delete path='${sourcePath}/relation'/>
<move source="${targetPath}/${entity}/updated_relation" target="${sourcePath}/relation" />
<move source="${targetPath}/${entity}/updated_record" target="${sourcePath}/${entity}" />
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,378 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "2000",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "200",
"rootBuilder": [
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "pid",
"type": "JSON",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[*].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"^Inside Front Cover$",
"^CORR Insights$",
"^Index des notions$",
"^Department of Error.$",
"^Untitled Item$",
"^Department of Error$",
"^Tome II : 1598 à 1605$",
"^(à lexception de roi, prince, royauté, pouvoir, image… qui sont omniprésents)$",
"^Museen und Ausstellungsinstitute in Nürnberg$",
"^Text/Conference Paper$",
"^Table des illustrations$",
"^An Intimate Insight on Psychopathy and a Novel Hermeneutic Psychological Science$",
"^Index des noms$",
"^Reply by Authors.$",
"^Titelblatt - Inhalt$",
"^Index des œuvres,$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}
}
}

@ -1,79 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-graph-provision-scholexplorer</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
</dependencies>
</project>

@ -1,425 +0,0 @@
package eu.dnetlib.dhp.export
import com.fasterxml.jackson.databind.ObjectMapper
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import scala.collection.JavaConverters._
/**
 * Flat description of an external reference derived from a DLI dataset.
 * Instances are built by `DLIToOAF.convertDLIDatasetToExternalReference` and copied onto an
 * OAF `ExternalReference` by `DLIToOAF.insertExternalRefs`.
 *
 * @param id       identifier generated from the id of the dataset the reference was built from
 * @param url      address of the referenced resource
 * @param sitename name of the hosting site (empty for plain `url` references)
 * @param label    first non-empty title of the dataset, possibly null
 * @param pid      raw persistent identifier value (stored as the reference identifier)
 * @param classId  class id of the qualifier attached to the reference ("accessionNumber" or "url")
 */
case class DLIExternalReference(id: String, url: String, sitename: String, label: String, pid: String, classId: String) {}
object DLIToOAF {
// Lookup from a DLI datasource key (as found in `collectedfrom.getKey`) to the KeyValue used as
// `collectedfrom` in the exported OAF records. The converters drop every `collectedfrom` entry
// whose key is missing here and discard the whole record when none is left.
// NOTE(review): the two generateKeyValue arguments look like (OpenAIRE datasource id, display name) - confirm.
val collectedFromMap: Map[String, KeyValue] = Map(
"dli_________::r3d100010527" -> generateKeyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive"),
"dli_________::r3d100010255" -> generateKeyValue("10|re3data_____::480d275ed6f9666ee76d6a1215eabf26", "Inter-university Consortium for Political and Social Research"),
"dli_________::r3d100011868" -> generateKeyValue("10|re3data_____::db814dc656a911b556dba42a331cebe9", "Mendeley Data"),
"dli_________::elsevier" -> generateKeyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier"),
"dli_________::openaire" -> generateKeyValue("10|infrastruct_::f66f1bd369679b5b077dcdf006089556", "OpenAIRE"),
"dli_________::thomsonreuters" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
"dli_________::r3d100010216" -> generateKeyValue("10|re3data_____::0fd79429de04343dbbec705d9b5f429f", "4TU.Centre for Research Data"),
"dli_________::r3d100010134" -> generateKeyValue("10|re3data_____::9633d1e8c4309c833c2c442abeb0cfeb", "PANGAEA"),
"dli_________::ieee" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
"dli_________::r3d100010197" -> generateKeyValue("10|re3data_____::9fd1d79973f7fda60cbe1d82e3819a68", "The Cambridge Structural Database"),
"dli_________::nature" -> generateKeyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature"),
"dli_________::datacite" -> generateKeyValue("10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite"),
"dli_________::r3d100010578" -> generateKeyValue("10|re3data_____::c4d751f29a7568011a4c80136b30b444", "IEDA"),
"dli_________::r3d100010464" -> generateKeyValue("10|re3data_____::23e2a81591099828f6b83a1c83150666", "Research Data Australia"),
"dli_________::r3d100010327" -> generateKeyValue("10|re3data_____::a644620b81135243dc9acc15d2362246", "Worldwide Protein Data Bank"),
"dli_________::pubmed" -> generateKeyValue("10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "PubMed Central"),
"dli_________::europe_pmc__" -> generateKeyValue("10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "Europe PubMed Central"),
"dli_________::crossref" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref")
)
// Maps a DLI relation type to the pair (relClass, subRelType) written on the exported relation
// by convertDLIRelation. Relation types that are not listed here are not exported.
val relationTypeMapping: Map[String, (String, String)] = Map(
"IsReferencedBy" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"References" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsRelatedTo" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsSupplementedBy" -> (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),
"Documents" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"Cites" -> (ModelConstants.CITES, ModelConstants.CITATION),
"Unknown" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsSourceOf" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsCitedBy" -> (ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
"Reviews" -> (ModelConstants.REVIEWS, ModelConstants.REVIEW),
"Describes" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"HasAssociationWith" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP)
)
// Pid types (qualifier class names) that filterPid accepts when turning a dataset into an external reference.
// NOTE(review): the identifier is misspelled ("expectecd"); it is kept because other members reference it.
val expectecdPidType = List("uniprot", "ena", "chembl", "ncbi-n", "ncbi-p", "genbank", "pdb", "url")
// Host names looked up by filterPid: a pid of type "url" is accepted only when its value
// contains one of these strings.
val filteredURL = List(
"www.ebi.ac.uk",
"www.uniprot.org",
"f1000.com",
"en.wikipedia.org",
"flybase.org",
"www.yeastgenome.org",
"research.bioinformatics.udel.edu",
"cancer.sanger.ac.uk",
"www.iedb.org",
"www.crd.york.ac.uk",
"www.wormbase.org",
"web.expasy.org",
"www.hal.inserm.fr",
"sabiork.h-its.org",
"zfin.org",
"www.pombase.org",
"www.guidetopharmacology.org",
"reactome.org"
)
// Maps a relation class to the class of the relation pointing in the opposite direction.
// It covers every relClass that relationTypeMapping can produce; looking up any other
// class with `rel_inverse(...)` throws NoSuchElementException.
val rel_inverse: Map[String, String] = Map(
ModelConstants.IS_RELATED_TO -> ModelConstants.IS_RELATED_TO,
ModelConstants.IS_SUPPLEMENTED_BY -> ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.CITES -> ModelConstants.IS_CITED_BY,
ModelConstants.IS_CITED_BY -> ModelConstants.CITES,
ModelConstants.REVIEWS -> ModelConstants.IS_REVIEWED_BY
)
// Normalisation table for publication pid types: the key is the class id found on the DLI pid,
// the value is the class id written on the exported pid. Pids whose type is not a key of this
// map are discarded by convertDLIPublicationToOAF.
// NOTE(review): "pbmid" looks like a misspelling seen in the input data - confirm before removing.
val PidTypeMap: Map[String, String] = Map(
"pbmid" -> "pmid",
"pmcid" -> "pmc",
"pmid" -> "pmid",
"pubmedid" -> "pmid",
"DOI" -> "doi",
"doi" -> "doi"
)
/**
 * Stamps every instance of the publication with the publication's first `collectedfrom` entry.
 *
 * @param r publication to patch in place; its `collectedfrom` list must be non-empty
 * @return the same publication object
 */
def fixInstance(r:Publication) :Publication = {
  val provenance = r.getCollectedfrom.asScala.head
  for (instance <- r.getInstance().asScala)
    instance.setCollectedfrom(provenance)
  r
}
/**
 * Stamps every instance of the dataset with the dataset's first `collectedfrom` entry.
 *
 * @param r dataset to patch in place; its `collectedfrom` list must be non-empty
 * @return the same dataset object
 */
def fixInstanceDataset(r:Dataset) :Dataset = {
  val provenance = r.getCollectedfrom.asScala.head
  for (instance <- r.getInstance().asScala)
    instance.setCollectedfrom(provenance)
  r
}
// JSON serializer shared by all toActionSet calls. Building an ObjectMapper is expensive and
// toActionSet runs once per exported record, so the mapper is created lazily once per JVM.
private lazy val actionMapper: ObjectMapper = new ObjectMapper()

/**
 * Wraps an OAF entity into an AtomicAction and serializes it to JSON.
 *
 * @param item a Dataset, Publication or Relation (subclasses included)
 * @return the pair (canonical class name of `item`, JSON of the AtomicAction),
 *         or null when `item` is of any other type
 */
def toActionSet(item: Oaf): (String, String) = item match {
  case dataset: Dataset => serializeAsAction(classOf[Dataset], dataset)
  case publication: Publication => serializeAsAction(classOf[Publication], publication)
  case relation: Relation => serializeAsAction(classOf[Relation], relation)
  case _ => null
}

// Builds the AtomicAction for `payload` declared as `clazz` and returns (runtime class name, JSON).
private def serializeAsAction[T <: Oaf](clazz: Class[T], payload: T): (String, String) = {
  val action = new AtomicAction[T]
  action.setClazz(clazz)
  action.setPayload(payload)
  // The key is the runtime class of the payload, which may be a subclass of `clazz`.
  (payload.getClass.getCanonicalName, actionMapper.writeValueAsString(action))
}
/**
 * Maps a dataset carrying a clinicaltrials.gov pid to the identifier of the matching
 * clinical-trial record.
 *
 * @param dataset the DLI dataset to inspect
 * @return (generated id of the dataset, id built from the md5 of the lower-cased pid value),
 *         or null when the dataset has no clinicaltrials.gov pid
 */
def convertClinicalTrial(dataset: DLIDataset): (String, String) = {
  val sourceId = generateId(dataset.getId)
  val trialIds = dataset.getPid.asScala
    .filter(pid => "clinicaltrials.gov".equalsIgnoreCase(pid.getQualifier.getClassname))
    .map(pid => s"50|r3111dacbab5::${DHPUtils.md5(pid.getValue.toLowerCase())}")
  trialIds.headOption match {
    case Some(trialId) => (sourceId, trialId)
    case None => null
  }
}
/**
 * Replaces the external references of a publication with the given ones.
 *
 * @param publication        publication to update in place
 * @param externalReferences references to convert and attach
 * @return the same publication object
 */
def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = {
  // Copies one flat DLI reference onto a new OAF ExternalReference.
  def toOafReference(ref: DLIExternalReference): ExternalReference = {
    val oafRef = new ExternalReference()
    oafRef.setSitename(ref.sitename)
    oafRef.setLabel(ref.label)
    oafRef.setUrl(ref.url)
    oafRef.setRefidentifier(ref.pid)
    oafRef.setDataInfo(generateDataInfo())
    oafRef.setQualifier(createQualifier(ref.classId, ModelConstants.DNET_EXTERNAL_REFERENCE_TYPE))
    oafRef
  }

  publication.setExternalReference(externalReferences.map(toOafReference).asJava)
  publication
}
/**
 * Tells whether a pid can be turned into an external reference.
 *
 * A pid is accepted when its type is listed in `expectecdPidType`; a pid of type "url" is
 * additionally required to contain one of the hosts in `filteredURL`.
 *
 * @param p the pid to check
 * @return true when the pid is accepted
 */
def filterPid(p: StructuredProperty): Boolean = {
  val pidType = p.getQualifier.getClassname
  if (!expectecdPidType.contains(pidType))
    false
  else if (pidType.equalsIgnoreCase("url"))
    filteredURL.exists(host => p.getValue.contains(host))
  else
    true
}
/**
 * Returns the first non-empty title value.
 *
 * Null entries and entries with a null value are skipped instead of raising a
 * NullPointerException.
 *
 * @param titles list of titles, may be null
 * @return the first non-empty value, or null when `titles` is null or holds no such value
 */
def extractTitle(titles: java.util.List[StructuredProperty]): String = {
  if (titles == null)
    return null
  titles.asScala
    .filter(t => t != null)
    .map(t => t.getValue)
    .find(v => v != null && v.nonEmpty)
    .orNull
}
/**
 * Builds an external reference from the first pid of the dataset accepted by `filterPid`.
 *
 * @param dataset the DLI dataset to convert
 * @return the external reference, or null when no pid is accepted or when an "ena" pid is
 *         shorter than 8 characters
 */
def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = {
  val pid: StructuredProperty = dataset.getPid.asScala.find(filterPid).orNull
  if (pid == null)
    return null
  val value = pid.getValue

  // All references share id, label and pid value; only url, site name and class id vary.
  def reference(url: String, sitename: String, classId: String = "accessionNumber"): DLIExternalReference =
    DLIExternalReference(generateId(dataset.getId), url, sitename, extractTitle(dataset.getTitle), value, classId)

  pid.getQualifier.getClassname match {
    case "uniprot" => reference(s"https://www.uniprot.org/uniprot/$value", "UniProt")
    case "ena" =>
      // Only the first 8 characters of the accession are used in the ENA view URL.
      if (value != null && value.length > 7)
        reference(s"https://www.ebi.ac.uk/ena/data/view/${value.substring(0, 8)}", "European Nucleotide Archive")
      else
        null
    case "chembl" => reference(s"https://www.ebi.ac.uk/chembl/compound_report_card/$value", "ChEMBL")
    // NOTE(review): "ncbi-p" also resolves through nuccore / "Nucleotide Database" - confirm this is intended.
    case "ncbi-n" | "ncbi-p" => reference(s"https://www.ncbi.nlm.nih.gov/nuccore/$value", "Nucleotide Database")
    case "genbank" => reference(s"https://www.ncbi.nlm.nih.gov/nuccore/$value", "GenBank")
    // PDB identifiers resolve at RCSB (same base URL used for dataset instances), not at NCBI nuccore.
    case "pdb" => reference(s"https://www.rcsb.org/structure/$value", "Protein Data Bank")
    case "url" => reference(value, "", "url")
    // Not reachable while filterPid and this match agree; avoids a MatchError if they drift apart.
    case _ => null
  }
}
/**
 * Converts a DLI publication into an OAF Publication.
 *
 * The record is discarded (null is returned) when it has no pid of a type listed in
 * `PidTypeMap`, no `collectedfrom` known to `collectedFromMap`, no author, no title,
 * no relevant date, or no DOI among the retained pids.
 *
 * Side effects: the qualifiers of the retained pids, of the first title, of the relevant
 * dates and of the subjects are overwritten on the input objects, which are reused in the result.
 *
 * @param inputPublication the DLI publication to convert
 * @return the converted publication, or null when the record is discarded
 */
def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = {
  val result = new Publication
  // Keep only pids of a known type and normalise their type (in place).
  val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid))
    .map(p => {
      p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid))
      p
    })
  if (cleanedPids.isEmpty)
    return null
  result.setId(generateId(inputPublication.getId))
  result.setDataInfo(generateDataInfo(invisible = true))
  if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null))
    return null
  // Translate DLI datasource keys; unknown datasources are dropped.
  result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
  if (result.getCollectedfrom.isEmpty)
    return null
  result.setPid(cleanedPids.asJava)
  result.setDateofcollection(inputPublication.getDateofcollection)
  result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava)
  // The pattern appends a literal 'Z' (UTC designator), so the clock must be read in UTC.
  result.setDateoftransformation(LocalDateTime.now(java.time.ZoneOffset.UTC).format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
  if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty)
    return null
  result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava)
  result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES))
  if (inputPublication.getSubject != null)
    result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava)
  if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty)
    return null
  // Only the first title is exported, flagged as main title.
  result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava)
  if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0)
    return null
  result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava)
  result.setDescription(inputPublication.getDescription)
  // The first relevant date doubles as date of acceptance.
  result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue))
  result.setPublisher(inputPublication.getPublisher)
  result.setSource(inputPublication.getSource)
  result.setBestaccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
  // The single exported instance points at the DOI resolver, so a DOI is mandatory.
  val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue)
  if (dois.isEmpty)
    return null
  val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance)
  if (i != null)
    result.setInstance(List(i).asJava)
  result
}
/**
 * Rewrites a DLI relation in place into an OAF result-to-result relation.
 *
 * The original relation type selects the relation class and sub-type through
 * `relationTypeMapping`; source and target ids are regenerated with `generateId`.
 *
 * @param r the relation to convert (modified in place)
 * @return the same relation object, or null when its type is not in `relationTypeMapping`
 */
def convertDLIRelation(r: Relation): Relation = {
  relationTypeMapping.get(r.getRelType) match {
    case Some((relClass, subRelType)) =>
      r.setRelType(ModelConstants.RESULT_RESULT)
      r.setRelClass(relClass)
      r.setSubRelType(subRelType)
      r.setSource(generateId(r.getSource))
      r.setTarget(generateId(r.getTarget))
      r
    case None =>
      null
  }
}
/**
 * Converts a DLI dataset into an OAF Dataset.
 *
 * The record is discarded (null is returned) when it has no `collectedfrom` known to
 * `collectedFromMap`, no pid of type doi or pdb, no author, no title or no relevant date.
 *
 * Side effects: the qualifiers of the first title, of the relevant dates and of the subjects
 * are overwritten on the input objects, which are reused in the result.
 *
 * @param d the DLI dataset to convert
 * @return the converted dataset, or null when the record is discarded
 */
def convertDLIDatasetTOOAF(d: DLIDataset): Dataset = {
  if (d.getCollectedfrom == null || d.getCollectedfrom.size() == 0 || (d.getCollectedfrom.size() == 1 && d.getCollectedfrom.get(0) == null))
    return null
  val result: Dataset = new Dataset
  result.setId(generateId(d.getId))
  result.setDataInfo(generateDataInfo())
  // Translate DLI datasource keys; unknown datasources are dropped.
  result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
  if (result.getCollectedfrom.isEmpty)
    return null
  result.setPid(d.getPid)
  // Values of the pids that can be resolved to a landing page (DOI or PDB).
  val fpids = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname) ||
    "pdb".equalsIgnoreCase(p.getQualifier.getClassname)
  ).map(p => p.getValue)
  if (fpids.isEmpty)
    return null
  result.setDateofcollection(d.getDateofcollection)
  result.setOriginalId(d.getPid.asScala.map(p => p.getValue).asJava)
  // The pattern appends a literal 'Z' (UTC designator), so the clock must be read in UTC.
  result.setDateoftransformation(LocalDateTime.now(java.time.ZoneOffset.UTC).format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
  if (d.getAuthor == null || d.getAuthor.isEmpty)
    return null
  result.setAuthor(d.getAuthor.asScala.map(convertAuthor).asJava)
  result.setResulttype(createQualifier(d.getResulttype.getClassid, d.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES))
  if (d.getSubject != null)
    result.setSubject(d.getSubject.asScala.map(convertSubject).asJava)
  if (d.getTitle == null || d.getTitle.isEmpty)
    return null
  // Only the first title is exported, flagged as main title.
  result.setTitle(List(patchTitle(d.getTitle.get(0))).asJava)
  if (d.getRelevantdate == null || d.getRelevantdate.size() == 0)
    return null
  result.setRelevantdate(d.getRelevantdate.asScala.map(patchRelevantDate).asJava)
  result.setDescription(d.getDescription)
  // The first relevant date doubles as date of acceptance.
  result.setDateofacceptance(asField(d.getRelevantdate.get(0).getValue))
  result.setPublisher(d.getPublisher)
  result.setSource(d.getSource)
  result.setBestaccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
  // Values shorter than 5 characters are treated as PDB ids, everything else as a DOI.
  val instance_urls = if (fpids.head.length < 5) s"https://www.rcsb.org/structure/${fpids.head}" else s"https://dx.doi.org/${fpids.head}"
  val i: Instance = createInstance(instance_urls, firstInstanceOrNull(d.getInstance()), result.getDateofacceptance, true)
  if (i != null) {
    // Ticket #6281 added pid to Instance
    i.setPid(result.getPid)
    result.setInstance(List(i).asJava)
  }
  result
}
/**
 * Returns the first element of a list of instances.
 *
 * @param instances list of instances, may be null or empty
 * @return the first instance, or null when the list is null or empty
 */
def firstInstanceOrNull(instances: java.util.List[Instance]): Instance = {
  if (instances != null && instances.size() > 0)
    instances.get(0)
  else
    null
}
/**
 * Creates an instance pointing at `url` with unknown access rights.
 *
 * @param url              the only URL of the new instance
 * @param originalInstance instance the `hostedby` value is copied from, when both are non-null
 * @param doa              date of acceptance assigned to the instance
 * @param dataset          true to type the instance as "Dataset" (0021), false for "Unknown" (0000)
 * @return the new instance
 */
def createInstance(url: String, originalInstance: Instance, doa: Field[String], dataset: Boolean = false): Instance = {
  val instance = new Instance
  instance.setUrl(List(url).asJava)
  val (typeId, typeName) = if (dataset) ("0021", "Dataset") else ("0000", "Unknown")
  instance.setInstancetype(createQualifier(typeId, typeName, ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
  val hasHost = originalInstance != null && originalInstance.getHostedby != null
  if (hasHost)
    instance.setHostedby(originalInstance.getHostedby)
  instance.setAccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
  instance.setDateofacceptance(doa)
  instance
}
/**
 * Overwrites the qualifier of a relevant date with the "unknown" datacite-date qualifier.
 *
 * @param d the date property to patch in place
 * @return the same property object
 */
def patchRelevantDate(d: StructuredProperty): StructuredProperty = {
  val unknownDateType = createQualifier(ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE)
  d.setQualifier(unknownDateType)
  d
}
/**
 * Flags a title as main title by overwriting its qualifier.
 *
 * @param t the title property to patch in place
 * @return the same property object
 */
def patchTitle(t: StructuredProperty): StructuredProperty = {
  val mainTitle = ModelConstants.MAIN_TITLE_QUALIFIER
  t.setQualifier(mainTitle)
  t
}
/**
 * Types a subject as "keyword" by overwriting its qualifier.
 *
 * @param s the subject property to patch in place
 * @return the same property object
 */
def convertSubject(s: StructuredProperty): StructuredProperty = {
  val keywordType = createQualifier("keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)
  s.setQualifier(keywordType)
  s
}
/**
 * Fills name and surname of an author from its full name.
 *
 * The full name is parsed with PacePerson; name and surname are set only when the parser
 * reports the result as accurate.
 *
 * @param a the author to patch in place, may be null
 * @return the same author object (null when `a` is null)
 */
def convertAuthor(a: Author): Author = {
  if (a != null) {
    val parsed = new PacePerson(a.getFullname, false)
    if (parsed.isAccurate) {
      a.setName(parsed.getNameString)
      a.setSurname(parsed.getSurnameString)
    }
  }
  a
}
/**
 * Builds the exported identifier of a result from its DLI identifier.
 *
 * The part of `id` following the first "::" (or, when there is no "::", the first "|") is
 * appended to the `50|scholix_____::` prefix; when neither separator occurs the suffix is empty.
 *
 * @param id the DLI identifier, must not be null
 * @return the exported identifier
 */
def generateId(id: String): String = {
  val separator = if (id.contains("::")) "::" else "|"
  val at = id.indexOf(separator)
  val suffix = if (at < 0) "" else id.substring(at + separator.length)
  s"50|scholix_____::$suffix"
}
}

@ -1,175 +0,0 @@
package eu.dnetlib.dhp.`export`
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.SparkConf
import scala.collection.mutable.ArrayBuffer
object SparkExportContentForOpenAire {
/**
 * Spark job that converts the Scholexplorer (DLI) graph stored under `workingDirPath` into
 * OAF entities wrapped as atomic actions.
 *
 * Every stage is written under `$workingPath/export/...` and read back by the next stage.
 * The final output is a gzip-compressed Hadoop sequence file of (class name, AtomicAction JSON)
 * pairs at `$workingPath/export/rawset`.
 *
 * @param args command-line arguments described by `input_export_content_parameters.json`;
 *             `master` and `workingDirPath` are the ones read here
 */
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkExportContentForOpenAire.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val workingPath = parser.get("workingDirPath")
// DLI input types are encoded with kryo, the exported OAF types as Java beans.
implicit val dliPubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication])
implicit val dliDatEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
implicit val pubEncoder: Encoder[Publication] = Encoders.bean(classOf[Publication])
implicit val datEncoder: Encoder[OafDataset] = Encoders.bean(classOf[OafDataset])
implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])
import spark.implicits._
// Step 1: convert relations, publications and datasets that are not deleted by inference;
// records rejected by the converters (null) are dropped.
val dsRel = spark.read.load(s"$workingPath/relation_b").as[Relation]
dsRel.filter(r => r.getDataInfo==null || r.getDataInfo.getDeletedbyinference ==false)
.map(DLIToOAF.convertDLIRelation)
.filter(r => r!= null)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS")
val dsPubs = spark.read.load(s"$workingPath/publication").as[DLIPublication]
// NOTE(review): unlike the relation filter above, this one (and the dataset one below)
// dereferences getDataInfo without a null check.
dsPubs
.filter(p=>p.getDataInfo.getDeletedbyinference == false)
.map(DLIToOAF.convertDLIPublicationToOAF)
.filter(p=>p!= null)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS")
val dsDataset = spark.read.load(s"$workingPath/dataset").as[DLIDataset]
dsDataset
.filter(p => p.getDataInfo.getDeletedbyinference == false)
.map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetDS")
// Step 2: keep relations whose source is an exported publication (relationDS_f1) and, among
// those, the ones whose target is an exported dataset (relationDS_filtered).
val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS").as[Publication]
val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/export/datasetDS").as[OafDataset]
val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS").as[Relation]
val pub_id = pubs.select("id").distinct()
val dat_id = dats.select("id").distinct()
pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_f1")
val relDS2= spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation]
// NOTE(review): the join condition refers to dats("id") although the joined dataset is dat_id;
// the publication join above uses pub_id("id") - presumably dat_id("id") was intended.
relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_filtered")
// Step 3: keep publications/datasets that appear as source/target of relationDS_f1, one row per
// id (the first one by lastupdatetimestamp).
// NOTE(review): the ids are taken from relationDS_f1, not from relationDS_filtered - confirm.
val r_source = relDS2.select(relDS2("source")).distinct()
val r_target = relDS2.select(relDS2("target")).distinct()
val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp")
pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1)
.withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS_filtered")
dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1)
.withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS")
// Step 4: build external references from the DLI datasets.
// NOTE(review): dsDataset is used here without the deletedbyinference filter applied in step 1.
dsDataset.map(DLIToOAF.convertDLIDatasetToExternalReference).filter(p => p != null).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference")
val pf = spark.read.load(s"$workingPath/export/publicationDS_filtered").select("id")
val relDS3 = spark.read.load(s"$workingPath/export/relationDS").as[Relation]
val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2)
val extRef = spark.read.load(s"$workingPath/export/externalReference").as[DLIExternalReference]
// Group the external references by source publication, keeping at most 100 per publication.
spark.createDataset(relationTo.joinWith(extRef, relationTo("target").equalTo(extRef("id")), "inner").map(d => {
val r = d._1
val ext = d._2
(r.getSource, ext)
}).rdd.groupByKey.map(f => {
var dli_ext = ArrayBuffer[DLIExternalReference]()
f._2.foreach(d => if (dli_ext.size < 100) dli_ext += d )
(f._1, dli_ext)
})).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference_grouped")
// Step 5: attach the grouped external references to the filtered publications.
val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS_filtered").as[Publication]
val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/export/externalReference_grouped").as[(String, List[DLIExternalReference])]
// NOTE(review): joinWith is called without a join type; if that is an inner join (presumably the
// default - confirm), publications without external references never reach publicationAS and
// the `else` branch below is not reachable.
groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t =>
{
val publication = t._2
if (t._1 != null) {
val eRefs = t._1._2
DLIToOAF.insertExternalRefs(publication, eRefs)
} else
publication
}
).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS")
// Step 6: relations towards clinical-trial datasets are re-targeted to the clinical-trial id
// computed by convertClinicalTrial.
dsDataset
.map(DLIToOAF.convertClinicalTrial)
.filter(p => p != null)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrials")
val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/clinicalTrials").as[(String,String)]
val relDS= spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation]
relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner")
.map(k =>{
val currentRel = k._1
currentRel.setTarget(k._2._2)
currentRel
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrialsRels")
// Step 7: emit every relation together with its inverse (source/target swapped, relClass taken
// from DLIToOAF.rel_inverse).
val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/export/clinicalTrialsRels").as[Relation]
val rels:Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS_filtered").as[Relation]
rels.union(clRels).flatMap(r => {
val inverseRel = new Relation
inverseRel.setSource(r.getTarget)
inverseRel.setTarget(r.getSource)
inverseRel.setDataInfo(r.getDataInfo)
inverseRel.setCollectedfrom(r.getCollectedfrom)
inverseRel.setRelType(r.getRelType)
inverseRel.setSubRelType(r.getSubRelType)
inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass))
List(r, inverseRel)
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationAS")
// Step 8: copy the entity-level collectedfrom onto the instances.
spark.read.load(s"$workingPath/export/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS_fixed")
spark.read.load(s"$workingPath/export/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS_fixed")
// Step 9: serialize everything as (class name, AtomicAction JSON) and store it as a gzip sequence file.
val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/relationAS").as[Relation].map(DLIToOAF.toActionSet)
val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet)
val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet)
fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

@ -1,112 +0,0 @@
package eu.dnetlib.dhp.export.zenodo;
import java.io.*;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.MakeTarArchive;
/**
 * Command-line entry point that packs the content of an HDFS directory into tar archives.
 *
 * <p>The archiving itself is delegated to {@link MakeTarArchive#tarMaxSize}; this class only
 * parses the arguments and opens the file system.
 */
public class MakeTar implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(MakeTar.class);

    /** Classpath location of the JSON file describing the accepted arguments. */
    private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/export/input_maketar_parameters.json";

    /** Archive name handed to {@link MakeTarArchive#tarMaxSize}. */
    private static final String ARCHIVE_NAME = "scholix_dump";

    /** Size limit handed to {@link MakeTarArchive#tarMaxSize}; the unit is defined by that method. */
    private static final int MAX_ARCHIVE_SIZE = 25;

    /**
     * Runs the archiving.
     *
     * @param args command-line arguments; {@code targetPath}, {@code nameNode} and
     *             {@code sourcePath} are read
     * @throws Exception when the arguments cannot be parsed or the archiving fails
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(readParameterSpec());
        parser.parseArgument(args);

        final String outputPath = parser.get("targetPath");
        log.info("hdfsPath: {}", outputPath);

        final String hdfsNameNode = parser.get("nameNode");
        log.info("nameNode: {}", hdfsNameNode);

        final String inputPath = parser.get("sourcePath");
        log.info("input path : {}", inputPath);

        final Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsNameNode);

        // The FileSystem is deliberately not closed here: FileSystem.get may hand out a
        // shared, cached instance.
        final FileSystem fileSystem = FileSystem.get(conf);

        MakeTarArchive.tarMaxSize(fileSystem, inputPath, outputPath, ARCHIVE_NAME, MAX_ARCHIVE_SIZE);
    }

    /**
     * Reads the argument specification from the classpath as UTF-8 text.
     *
     * @return the JSON text of the specification
     * @throws IOException when the resource is missing or cannot be read
     */
    private static String readParameterSpec() throws IOException {
        try (InputStream in = MakeTar.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
            if (in == null) {
                throw new FileNotFoundException("Missing classpath resource: " + PARAMETERS_RESOURCE);
            }
            // Explicit charset: the result must not depend on the platform default.
            return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }
}

@ -1,80 +0,0 @@
package eu.dnetlib.dhp.export.zenodo;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
/**
 * Command-line entry point that uploads every file found (recursively) under an HDFS path to a
 * Zenodo deposition through {@link ZenodoAPIClient}, then sends the deposition metadata.
 */
public class SendToZenodoHDFS implements Serializable {

    private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);

    /**
     * Parses the arguments, opens a new deposition or a new version of an existing one, uploads
     * the files and finally sends the metadata.
     *
     * @param args command-line arguments, parsed by {@link ArgumentApplicationParser}
     * @throws MissingConceptDoiException if a new version is requested but no
     *                                    {@code conceptRecordId} has been provided
     * @throws Exception                  on any parsing, HDFS or upload failure
     */
    public static void main(final String[] args) throws Exception, MissingConceptDoiException {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SendToZenodoHDFS.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/export/upload_zenodo.json")));

        parser.parseArgument(args);

        final String hdfsPath = parser.get("hdfsPath");
        final String hdfsNameNode = parser.get("nameNode");
        final String access_token = parser.get("accessToken");
        final String connection_url = parser.get("connectionUrl");
        final String metadata = parser.get("metadata");
        final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition"));
        // May be null: it is only required when a new version of an existing record is created.
        final String concept_rec_id = parser.get("conceptRecordId");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsNameNode);

        FileSystem fileSystem = FileSystem.get(conf);

        RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
            .listFiles(
                new Path(hdfsPath), true);

        ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
        if (newDeposition) {
            zenodoApiClient.newDeposition();
        } else {
            if (concept_rec_id == null) {
                throw new MissingConceptDoiException("No concept record id has been provided");
            }
            zenodoApiClient.newVersion(concept_rec_id);
        }

        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();

            Path p = fileStatus.getPath();
            String p_string = p.toString();
            // Skip the _SUCCESS marker files.
            if (!p_string.endsWith("_SUCCESS")) {
                String name = p_string.substring(p_string.lastIndexOf("/") + 1);
                log.info("Sending information for community: " + name);
                // Release the HDFS stream as soon as the upload of this file is over.
                try (FSDataInputStream inputStream = fileSystem.open(p)) {
                    zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
                }
            }
        }

        zenodoApiClient.sendMretadata(metadata);
        // NOTE(review): the publish() call was already disabled in the original code -- confirm
        // whether the deposition is meant to be published from here.
        // zenodoApiClient.publish();
    }
}

@ -1,98 +0,0 @@
package eu.dnetlib.dhp.provision;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
 * Command-line entry point that drops and re-creates the two Elasticsearch indexes
 * {@code <index>_object} and {@code <index>_scholix} on the first node of the selected cluster.
 */
public class DropAndCreateESIndex {

    /** URL pattern: node address, index name prefix, index name suffix. */
    private static final String INDEX_URL_TEMPLATE = "http://%s:9200/%s_%s";

    /**
     * Parses the arguments, resolves the cluster address from {@code cluster.json}, deletes both
     * indexes and creates them again with the bundled index configurations.
     *
     * @param args command-line arguments, parsed by {@link ArgumentApplicationParser}
     * @throws Exception on parsing or HTTP failures, or when the cluster name is unknown
     */
    public static void main(String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    DropAndCreateESIndex.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/provision/dropAndCreateIndex.json")));
        parser.parseArgument(args);

        final String index = parser.get("index");
        final String cluster = parser.get("cluster");

        final String clusterJson = IOUtils
            .toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/cluster.json"));

        @SuppressWarnings("unchecked")
        final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);

        final String nodes = clusterMap.get(cluster);
        if (nodes == null) {
            throw new IllegalArgumentException("Unknown cluster: " + cluster);
        }
        // The cluster entry is a comma-separated list; only its first element is used.
        final String ip = nodes.split(",")[0];

        System.out.println(ip);

        final String summaryUrl = String.format(INDEX_URL_TEMPLATE, ip, index, "object");
        final String scholixUrl = String.format(INDEX_URL_TEMPLATE, ip, index, "scholix");

        // One client for all requests; it and every response are closed even when a request fails.
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            dropIndex(client, summaryUrl, "deleting Index SUMMARY");
            dropIndex(client, scholixUrl, "deleting Index SCHOLIX");

            final String summaryConf = IOUtils
                .toString(
                    DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/summary_index.json"));

            final String scholixConf = IOUtils
                .toString(
                    DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/scholix_index.json"));

            createIndex(client, summaryUrl, summaryConf, "creating First Index SUMMARY");
            createIndex(client, scholixUrl, scholixConf, "creating Index SCHOLIX");
        }
    }

    /** Sends an HTTP DELETE to {@code indexUrl}, then prints {@code label} and the response status line. */
    private static void dropIndex(final CloseableHttpClient client, final String indexUrl, final String label)
        throws java.io.IOException {
        try (CloseableHttpResponse response = client.execute(new HttpDelete(indexUrl))) {
            System.out.println(label);
            System.out.println(response.getStatusLine());
        }
    }

    /** Prints {@code label}, sends {@code indexConf} with an HTTP PUT to {@code indexUrl} and prints the response status line. */
    private static void createIndex(final CloseableHttpClient client, final String indexUrl,
        final String indexConf, final String label) throws java.io.IOException {
        final HttpPut put = new HttpPut(indexUrl);
        put.setEntity(new StringEntity(indexConf));
        put.setHeader("Accept", "application/json");
        put.setHeader("Content-type", "application/json");

        System.out.println(label);
        try (CloseableHttpResponse response = client.execute(put)) {
            System.out.println(response.getStatusLine());
        }
    }
}

@ -1,48 +0,0 @@
package eu.dnetlib.dhp.provision;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.provision.scholix.summary.Typology;
import eu.dnetlib.dhp.utils.DHPUtils;
/** Static helpers and JSON path constants shared by the provision jobs. */
public class ProvisionUtil {

    public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference";
    public static final String TARGETJSONPATH = "$.target";
    public static final String SOURCEJSONPATH = "$.source";

    /**
     * Tells whether a JSON item is not flagged as deleted by inference.
     *
     * @param item the JSON document to inspect
     * @return {@code false} only when the value extracted at {@link #deletedByInferenceJPATH}
     *         equals {@code "true"} ignoring case; {@code true} otherwise
     */
    public static Boolean isNotDeleted(final String item) {
        return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item));
    }

    /**
     * Maps the prefix of an identifier (the text before the first {@code |}) to its typology:
     * {@code 50} is a publication, {@code 60} a dataset and {@code 70} an unknown item.
     *
     * @param id the identifier to classify
     * @return the typology matching the identifier prefix
     * @throws RuntimeException if the prefix is none of the supported ones
     */
    public static Typology getItemTypeFromId(String id) {
        switch (StringUtils.substringBefore(id, "|")) {
            case "50":
                return Typology.publication;
            case "60":
                return Typology.dataset;
            case "70":
                return Typology.unknown;
            default:
                // Report the offending identifier so that the failing record can be traced.
                throw new RuntimeException("Unknown ID type: " + id);
        }
    }
}

@ -1,59 +0,0 @@
package eu.dnetlib.dhp.provision;
import java.io.Serializable;
/**
 * Models, for one source item, how many related datasets, publications and unknown items it has.
 *
 * <p>All three counters are exposed as {@code long} bean properties whose getter and setter agree
 * on the type, as the JavaBeans conventions expect.
 */
public class RelatedItemInfo implements Serializable {

    private String source;

    private long relatedDataset = 0;

    private long relatedPublication = 0;

    private long relatedUnknown = 0;

    /** Creates an instance with no source and all counters set to zero. */
    public RelatedItemInfo() {
    }

    /**
     * @param source             identifier of the item the counters refer to
     * @param relatedDataset     number of related datasets
     * @param relatedPublication number of related publications
     * @param relatedUnknown     number of related items of unknown type
     */
    public RelatedItemInfo(
        String source, long relatedDataset, long relatedPublication, long relatedUnknown) {
        this.source = source;
        this.relatedDataset = relatedDataset;
        this.relatedPublication = relatedPublication;
        this.relatedUnknown = relatedUnknown;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public long getRelatedDataset() {
        return relatedDataset;
    }

    public void setRelatedDataset(long relatedDataset) {
        this.relatedDataset = relatedDataset;
    }

    public long getRelatedPublication() {
        return relatedPublication;
    }

    public void setRelatedPublication(long relatedPublication) {
        this.relatedPublication = relatedPublication;
    }

    public long getRelatedUnknown() {
        return relatedUnknown;
    }

    /**
     * Sets the number of related items of unknown type.
     *
     * <p>The parameter is a {@code long}, like the field and the getter; {@code int} arguments
     * are still accepted through widening.
     */
    public void setRelatedUnknown(long relatedUnknown) {
        this.relatedUnknown = relatedUnknown;
    }
}

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.provision
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.provision.scholix.Scholix
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
/**
 * Spark job that reads the `summary` and `scholix` datasets stored under the working path and
 * writes them back as gzip-compressed text files containing one JSON document per line
 * (`summary_json` and `scholix_json`).
 */
object SparkConvertDatasetToJson {

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

    val workingPath = parser.get("workingPath")

    // The ObjectMapper is built once per partition instead of once per record.
    spark.read.load(s"$workingPath/summary").as[ScholixSummary]
      .mapPartitions((records: Iterator[ScholixSummary]) => {
        val mapper = new ObjectMapper()
        records.map(s => mapper.writeValueAsString(s))
      })(Encoders.STRING)
      .rdd.repartition(500).saveAsTextFile(s"$workingPath/summary_json", classOf[GzipCodec])

    spark.read.load(s"$workingPath/scholix").as[Scholix]
      .mapPartitions((records: Iterator[Scholix]) => {
        val mapper = new ObjectMapper()
        records.map(s => mapper.writeValueAsString(s))
      })(Encoders.STRING)
      .rdd.repartition(2000).saveAsTextFile(s"$workingPath/scholix_json", classOf[GzipCodec])
  }
}

@ -1,60 +0,0 @@
package eu.dnetlib.dhp.provision
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Relation
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.functions.{coalesce, col, count, lit}
/**
 * Spark job that reads the relations and, for every relation source, counts how many of its
 * targets have an identifier starting with 50, 60 or 70. The three counts are stored in the
 * columns relatedPublication, relatedDataset and relatedUnknown of the `relatedItemCount` output.
 */
object SparkExtractRelationCount {

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractRelationCount.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_related_entities_parameters.json")))
    parser.parseArgument(args)
    val spark = SparkSession.builder.appName(SparkExtractRelationCount.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    val workingDirPath = parser.get("workingDirPath")
    val relationPath = parser.get("relationPath")

    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]

    // Re-encode the relations as beans so that their fields can be addressed as columns.
    val relation = spark.read.load(relationPath).as[Relation].map(r => r)(Encoders.bean(classOf[Relation]))

    // Counts, per source, the targets selected by `condition`; the result has the two columns
    // `sourceAlias` and `countColumn`.
    def countTargets(condition: String, countColumn: String, sourceAlias: String) =
      relation
        .where(condition)
        .groupBy("source")
        .agg(count("target").as(countColumn))
        .select(col("source").alias(sourceAlias), col(countColumn))

    val relatedPublication = countTargets("target like '50%'", "publication", "p_source")
    val relatedDataset = countTargets("target like '60%'", "dataset", "d_source")
    val relatedUnknown = countTargets("target like '70%'", "unknown", "u_source")

    // Full outer joins keep the sources that appear in only some of the three groups.
    val publicationAndDataset = relatedPublication
      .join(relatedDataset, col("p_source").equalTo(col("d_source")), "full")
      .select(
        coalesce(col("p_source"), col("d_source")).alias("id"),
        col("publication"),
        col("dataset"))

    val relatedItemCount = publicationAndDataset
      .join(relatedUnknown, col("u_source").equalTo(col("id")), "full")
      .select(
        coalesce(col("u_source"), col("id")).alias("source"),
        coalesce(col("publication"), lit(0)).alias("relatedPublication"),
        coalesce(col("dataset"), lit(0)).alias("relatedDataset"),
        coalesce(col("unknown"), lit(0)).alias("relatedUnknown"))

    relatedItemCount.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/relatedItemCount")
  }
}

@ -1,94 +0,0 @@
package eu.dnetlib.dhp.provision
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixResource}
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import eu.dnetlib.dhp.schema.oaf.Relation
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
/**
 * Spark job that builds the Scholix links: summaries are joined with the relations on the source
 * side, the result is joined with the summaries again on the target side, and finally the links
 * sharing the same identifier are merged into one.
 */
object SparkGenerateScholixIndex {

  /** Aggregator that folds every Scholix of a group into a single one through `mergeFrom`. */
  def getScholixAggregator(): Aggregator[(String, Scholix), Scholix, Scholix] =
    new Aggregator[(String, Scholix), Scholix, Scholix] {

      override def zero: Scholix = new Scholix()

      override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
        b.mergeFrom(a._2)
        b
      }

      override def merge(left: Scholix, right: Scholix): Scholix = {
        left.mergeFrom(right)
        left
      }

      override def finish(reduction: Scholix): Scholix = reduction

      override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

      override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    }

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholixIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    conf.set("spark.sql.shuffle.partitions", "4000")
    val spark = SparkSession.builder.config(conf).appName(SparkGenerateScholixIndex.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    val graphPath = parser.get("graphPath")
    val workingDirPath = parser.get("workingDirPath")

    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val tupleScholix: Encoder[(String, Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder)

    // Summaries keyed by their id and relations keyed by their source.
    val scholixSummary: Dataset[(String, ScholixSummary)] = spark.read.load(s"$workingDirPath/summary").as[ScholixSummary]
      .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder))

    val sourceRelations: Dataset[(String, Relation)] = spark.read.load(s"$graphPath/relation").as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))

    // Step 1: attach the source summary to each relation; the result is keyed by the relation target.
    val withSource: Dataset[(String, Scholix)] = scholixSummary
      .joinWith(sourceRelations, scholixSummary("_1").equalTo(sourceRelations("_1")), "inner")
      .map(joined => {
        val sourceSummary = joined._1._2
        val rel = joined._2._2
        (rel.getTarget, Scholix.generateScholixWithSource(sourceSummary, rel))
      })
    withSource.repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source")

    // Step 2: attach the target summary and compute identifier and publishers.
    val sTarget: Dataset[(String, Scholix)] = spark.read.load(s"$workingDirPath/scholix_source").as[(String, Scholix)]

    val withTarget: Dataset[Scholix] = sTarget
      .joinWith(scholixSummary, sTarget("_1").equalTo(scholixSummary("_1")), "inner")
      .map(joined => {
        val targetSummary = joined._2._2
        val link = joined._1._2
        link.setTarget(ScholixResource.fromSummary(targetSummary))
        link.generateIdentifier()
        link.generatelinkPublisher()
        link
      })
    withTarget.repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_r")

    // Step 3: merge the links that share the same identifier.
    val finalScholix: Dataset[Scholix] = spark.read.load(s"$workingDirPath/scholix_r").as[Scholix]

    val merged: Dataset[Scholix] = finalScholix
      .map(d => (d.getIdentifier, d))(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(getScholixAggregator().toColumn)
      .map(entry => entry._2)

    merged.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix")
  }
}

@ -1,70 +0,0 @@
package eu.dnetlib.dhp.provision
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
/**
 * Spark job that converts publications, datasets and unknown entities of the graph into
 * ScholixSummary records, enriches them with the related-item counters read from
 * `relatedItemCount` and stores the result under `summary`.
 */
object SparkGenerateSummaryIndex {

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateSummaryIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")))
    parser.parseArgument(args)
    val spark = SparkSession.builder.appName(SparkGenerateSummaryIndex.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    val graphPath = parser.get("graphPath")
    val workingDirPath = parser.get("workingDirPath")

    implicit val relatedItemInfoEncoders: Encoder[RelatedItemInfo] = Encoders.bean(classOf[RelatedItemInfo])
    implicit val datasetEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
    implicit val publicationEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
    implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val oafWithIdEncoder: Encoder[(String, Oaf)] = Encoders.tuple(Encoders.STRING, oafEncoder)
    implicit val scholixSummaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixSummaryEncoderTuple: Encoder[(String, ScholixSummary)] = Encoders.tuple(Encoders.STRING, scholixSummaryEncoder)

    // Each entity type is loaded as a generic Oaf and keyed by the id of its concrete type.
    val publications = spark.read.load(s"$graphPath/publication").as[Oaf]
      .map(oaf => (oaf.asInstanceOf[DLIPublication].getId, oaf))
    val datasets = spark.read.load(s"$graphPath/dataset").as[Oaf]
      .map(oaf => (oaf.asInstanceOf[DLIDataset].getId, oaf))
    val unknowns = spark.read.load(s"$graphPath/unknown").as[Oaf]
      .map(oaf => (oaf.asInstanceOf[DLIUnknown].getId, oaf))

    val allEntities = publications.union(datasets).union(unknowns)

    val summary: Dataset[(String, ScholixSummary)] = allEntities.map(entity => {
      val converted = ScholixSummary.fromOAF(entity._2)
      (converted.getId, converted)
    })

    val relatedItemInfoDs: Dataset[RelatedItemInfo] = spark.read.load(s"$workingDirPath/relatedItemCount").as[RelatedItemInfo]

    // Inner join: only the summaries having an entry in relatedItemCount are kept.
    val enriched = summary
      .joinWith(relatedItemInfoDs, summary("_1").equalTo(relatedItemInfoDs("source")), "inner")
      .map(joined => {
        val current = joined._1._2
        val counters = joined._2
        current.setRelatedDatasets(counters.getRelatedDataset)
        current.setRelatedPublications(counters.getRelatedPublication)
        current.setRelatedUnknown(counters.getRelatedUnknown)
        current
      })

    enriched
      .filter(s => s.getLocalIdentifier != null)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/summary")
  }
}

@ -1,61 +0,0 @@
package eu.dnetlib.dhp.provision;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
 * Spark job that reads a text file of JSON documents and indexes them on the Elasticsearch
 * cluster selected through the {@code cluster} argument.
 */
public class SparkIndexCollectionOnES {

    /**
     * @param args command-line arguments, parsed by {@link ArgumentApplicationParser}
     * @throws Exception on argument parsing, configuration loading or indexing failures
     */
    public static void main(String[] args) throws Exception {

        final String parametersJson = IOUtils
            .toString(
                SparkIndexCollectionOnES.class.getResourceAsStream("/eu/dnetlib/dhp/provision/index_on_es.json"));
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parametersJson);
        parser.parseArgument(args);

        final SparkConf conf = new SparkConf();
        conf.setAppName(SparkIndexCollectionOnES.class.getSimpleName());
        conf.setMaster(parser.get("master"));
        conf.set("spark.sql.shuffle.partitions", "4000");

        final String sourcePath = parser.get("sourcePath");
        final String index = parser.get("index");
        final String idPath = parser.get("idPath");
        final String cluster = parser.get("cluster");

        // The cluster name is resolved to its node list through the bundled cluster.json.
        final String clusterJson = IOUtils
            .toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/cluster.json"));
        final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);

        final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        final JavaRDD<String> inputRdd = sc.textFile(sourcePath);

        // Settings handed to the Elasticsearch Spark connector.
        final Map<String, String> esCfg = new HashMap<>();
        esCfg.put("es.nodes", clusterMap.get(cluster));
        esCfg.put("es.mapping.id", idPath);
        esCfg.put("es.batch.write.retry.count", "8");
        esCfg.put("es.batch.write.retry.wait", "60s");
        esCfg.put("es.batch.size.entries", "200");
        esCfg.put("es.nodes.wan.only", "true");

        JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
    }
}

@ -1,286 +0,0 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
public class Scholix implements Serializable {
private String publicationDate;
private List<ScholixEntityId> publisher;
private List<ScholixEntityId> linkprovider;
private ScholixRelationship relationship;
private ScholixResource source;
private ScholixResource target;
private String identifier;
public Scholix clone(final ScholixResource t) {
final Scholix clone = new Scholix();
clone.setPublicationDate(publicationDate);
clone.setPublisher(publisher);
clone.setLinkprovider(linkprovider);
clone.setRelationship(relationship);
clone.setSource(source);
clone.setTarget(t);
clone.generatelinkPublisher();
clone.generateIdentifier();
return clone;
}
public static Scholix generateScholixWithSource(
final String sourceSummaryJson, final String relation) {
final ObjectMapper mapper = new ObjectMapper();
try {
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
Relation rel = mapper.readValue(relation, Relation.class);
final Scholix s = new Scholix();
if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0)
s.setPublicationDate(scholixSummary.getDate().get(0));
s
.setLinkprovider(
rel
.getCollectedfrom()
.stream()
.map(
cf -> new ScholixEntityId(
cf.getValue(),
Collections
.singletonList(
new ScholixIdentifier(cf.getKey(), "dnet_identifier"))))
.collect(Collectors.toList()));
s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null));
s.setSource(ScholixResource.fromSummary(scholixSummary));
return s;
} catch (Throwable e) {
throw new RuntimeException(
String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e);
}
}
public static Scholix generateScholixWithSource(
final ScholixSummary scholixSummary, final Relation rel) {
final Scholix s = new Scholix();
if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0)
s.setPublicationDate(scholixSummary.getDate().get(0));
s
.setLinkprovider(
rel
.getCollectedfrom()
.stream()
.map(
cf -> new ScholixEntityId(
cf.getValue(),
Collections
.singletonList(
new ScholixIdentifier(cf.getKey(), "dnet_identifier"))))
.collect(Collectors.toList()));
s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null));
s.setSource(ScholixResource.fromSummary(scholixSummary));
s.setIdentifier(rel.getTarget());
return s;
}
private List<ScholixEntityId> mergeScholixEntityId(final List<ScholixEntityId> a, final List<ScholixEntityId> b) {
final List<ScholixEntityId> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
if (b != null)
b.forEach(s -> {
if (s != null) {
int tt = (int) m
.stream()
.filter(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName()))
.count();
if (tt == 0) {
m.add(s);
}
}
});
return m;
}
private List<ScholixIdentifier> mergeScholixIdnetifier(final List<ScholixIdentifier> a,
final List<ScholixIdentifier> b) {
final List<ScholixIdentifier> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
if (b != null)
b.forEach(s -> {
int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count();
if (tt == 0) {
m.add(s);
}
});
return m;
}
private List<ScholixCollectedFrom> mergeScholixCollectedFrom(final List<ScholixCollectedFrom> a,
final List<ScholixCollectedFrom> b) {
final List<ScholixCollectedFrom> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
if (b != null)
b.forEach(s -> {
int tt = (int) m
.stream()
.filter(t -> t.getProvider().getName().equalsIgnoreCase(s.getProvider().getName()))
.count();
if (tt == 0) {
m.add(s);
}
});
return m;
}
private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) {
ScholixRelationship result = new ScholixRelationship();
result.setName(a == null || StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
result.setInverse(a == null || StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
result.setSchema(a == null || StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
return result;
}
private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) {
if (a == null)
return b;
final ScholixResource result = new ScholixResource();
result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom()));
result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator()));
result
.setDnetIdentifier(
StringUtils.isBlank(a.getDnetIdentifier()) ? b.getDnetIdentifier() : a.getDnetIdentifier());
result.setIdentifier(mergeScholixIdnetifier(a.getIdentifier(), b.getIdentifier()));
result.setObjectType(StringUtils.isNotBlank(a.getObjectType()) ? a.getObjectType() : b.getObjectType());
result
.setObjectSubType(
StringUtils.isNotBlank(a.getObjectSubType()) ? a.getObjectSubType() : b.getObjectSubType());
result.setPublisher(mergeScholixEntityId(a.getPublisher(), b.getPublisher()));
result
.setPublicationDate(
StringUtils.isNotBlank(a.getPublicationDate()) ? a.getPublicationDate() : b.getPublicationDate());
result.setTitle(StringUtils.isNotBlank(a.getTitle()) ? a.getTitle() : b.getTitle());
return result;
}
public void mergeFrom(final Scholix other) {
linkprovider = mergeScholixEntityId(linkprovider, other.getLinkprovider());
publisher = mergeScholixEntityId(publisher, other.getPublisher());
if (StringUtils.isEmpty(publicationDate))
publicationDate = other.getPublicationDate();
relationship = mergeRelationships(relationship, other.getRelationship());
source = mergeResource(source, other.getSource());
target = mergeResource(target, other.getTarget());
generateIdentifier();
}
public void generatelinkPublisher() {
Set<String> publisher = new HashSet<>();
if (source.getPublisher() != null)
publisher
.addAll(
source
.getPublisher()
.stream()
.map(ScholixEntityId::getName)
.collect(Collectors.toList()));
if (target.getPublisher() != null)
publisher
.addAll(
target
.getPublisher()
.stream()
.map(ScholixEntityId::getName)
.collect(Collectors.toList()));
this.publisher = publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList());
}
public void generateIdentifier() {
setIdentifier(
DHPUtils
.md5(
String
.format(
"%s::%s::%s",
source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier())));
}
public Scholix addTarget(final String targetSummaryJson) {
final ObjectMapper mapper = new ObjectMapper();
try {
ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class);
setTarget(ScholixResource.fromSummary(targetSummary));
generateIdentifier();
return this;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public String getPublicationDate() {
return publicationDate;
}
public void setPublicationDate(String publicationDate) {
this.publicationDate = publicationDate;
}
public List<ScholixEntityId> getPublisher() {
return publisher;
}
public void setPublisher(List<ScholixEntityId> publisher) {
this.publisher = publisher;
}
public List<ScholixEntityId> getLinkprovider() {
return linkprovider;
}
public void setLinkprovider(List<ScholixEntityId> linkprovider) {
this.linkprovider = linkprovider;
}
public ScholixRelationship getRelationship() {
return relationship;
}
public void setRelationship(ScholixRelationship relationship) {
this.relationship = relationship;
}
public ScholixResource getSource() {
return source;
}
public void setSource(ScholixResource source) {
this.source = source;
}
public ScholixResource getTarget() {
return target;
}
public void setTarget(ScholixResource target) {
this.target = target;
}
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

@ -1,45 +0,0 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
/**
 * Describes where a resource has been collected from: the provider, together with a provision
 * mode and a completion status.
 */
public class ScholixCollectedFrom implements Serializable {

    private ScholixEntityId provider;
    private String provisionMode;
    private String completionStatus;

    /** Creates an instance with every property unset. */
    public ScholixCollectedFrom() {
    }

    /**
     * @param provider         the provider of the resource
     * @param provisionMode    the provision mode
     * @param completionStatus the completion status
     */
    public ScholixCollectedFrom(
        final ScholixEntityId provider, final String provisionMode, final String completionStatus) {
        this.provider = provider;
        this.provisionMode = provisionMode;
        this.completionStatus = completionStatus;
    }

    /** @return the provider of the resource */
    public ScholixEntityId getProvider() {
        return this.provider;
    }

    /** @param provider the provider of the resource */
    public void setProvider(final ScholixEntityId provider) {
        this.provider = provider;
    }

    /** @return the provision mode */
    public String getProvisionMode() {
        return this.provisionMode;
    }

    /** @param provisionMode the provision mode */
    public void setProvisionMode(final String provisionMode) {
        this.provisionMode = provisionMode;
    }

    /** @return the completion status */
    public String getCompletionStatus() {
        return this.completionStatus;
    }

    /** @param completionStatus the completion status */
    public void setCompletionStatus(final String completionStatus) {
        this.completionStatus = completionStatus;
    }
}

@ -1,34 +0,0 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
import java.util.List;
/** A named entity together with the list of its identifiers. */
public class ScholixEntityId implements Serializable {

    private String name;
    private List<ScholixIdentifier> identifiers;

    /** Creates an instance with no name and no identifiers. */
    public ScholixEntityId() {
    }

    /**
     * @param name        the name of the entity
     * @param identifiers the identifiers of the entity
     */
    public ScholixEntityId(final String name, final List<ScholixIdentifier> identifiers) {
        this.name = name;
        this.identifiers = identifiers;
    }

    /** @return the name of the entity */
    public String getName() {
        return this.name;
    }

    /** @param name the name of the entity */
    public void setName(final String name) {
        this.name = name;
    }

    /** @return the identifiers of the entity */
    public List<ScholixIdentifier> getIdentifiers() {
        return this.identifiers;
    }

    /** @param identifiers the identifiers of the entity */
    public void setIdentifiers(final List<ScholixIdentifier> identifiers) {
        this.identifiers = identifiers;
    }
}

@ -1,33 +0,0 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
/** An identifier value paired with the schema it belongs to. */
public class ScholixIdentifier implements Serializable {

    private String identifier;
    private String schema;

    /** Creates an instance with both properties unset. */
    public ScholixIdentifier() {
    }

    /**
     * @param identifier the identifier value
     * @param schema     the schema of the identifier
     */
    public ScholixIdentifier(final String identifier, final String schema) {
        this.identifier = identifier;
        this.schema = schema;
    }

    /** @return the identifier value */
    public String getIdentifier() {
        return this.identifier;
    }

    /** @param identifier the identifier value */
    public void setIdentifier(final String identifier) {
        this.identifier = identifier;
    }

    /** @return the schema of the identifier */
    public String getSchema() {
        return this.schema;
    }

    /** @param schema the schema of the identifier */
    public void setSchema(final String schema) {
        this.schema = schema;
    }
}

@ -1,43 +0,0 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
/**
 * Serializable bean describing a relationship: its name, the schema the name belongs to and
 * the name of the inverse relationship.
 */
public class ScholixRelationship implements Serializable {

    private String name;
    private String schema;
    private String inverse;

    /** Creates an instance whose properties are all {@code null}. */
    public ScholixRelationship() {
    }

    /**
     * Creates a populated instance.
     *
     * @param name the relationship name
     * @param schema the schema the name belongs to
     * @param inverse the name of the inverse relationship
     */
    public ScholixRelationship(final String name, final String schema, final String inverse) {
        this.inverse = inverse;
        this.schema = schema;
        this.name = name;
    }

    /** @return the relationship name, possibly {@code null} */
    public String getName() {
        return this.name;
    }

    /** @param name the relationship name to store */
    public void setName(final String name) {
        this.name = name;
    }

    /** @return the schema, possibly {@code null} */
    public String getSchema() {
        return this.schema;
    }

    /** @param schema the schema to store */
    public void setSchema(final String schema) {
        this.schema = schema;
    }

    /** @return the inverse relationship name, possibly {@code null} */
    public String getInverse() {
        return this.inverse;
    }

    /** @param inverse the inverse relationship name to store */
    public void setInverse(final String inverse) {
        this.inverse = inverse;
    }
}

@ -1,151 +0,0 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
/**
 * Serializable bean describing one end of a Scholix link (identifiers, type, title, creators,
 * publication date, publishers and provenance).
 */
public class ScholixResource implements Serializable {

    private List<ScholixIdentifier> identifier;
    private String dnetIdentifier;
    private String objectType;
    private String objectSubType;
    private String title;
    private List<ScholixEntityId> creator;
    private String publicationDate;
    private List<ScholixEntityId> publisher;
    private List<ScholixCollectedFrom> collectedFrom;

    /**
     * Builds a resource from a {@link ScholixSummary}.
     *
     * <p>Every optional part of the summary (local identifiers, typology, titles, authors,
     * dates, publishers, datasources) is copied only when present. Only the first title and
     * the first date are used. {@code objectSubType} is never populated by this method.
     *
     * @param summary the summary to convert; must not be {@code null}
     * @return a new resource carrying the data of {@code summary}
     */
    public static ScholixResource fromSummary(ScholixSummary summary) {
        final ScholixResource resource = new ScholixResource();
        resource.setDnetIdentifier(summary.getId());

        // The summary builders leave localIdentifier unset when the source record has no pid,
        // so it has to be guarded like the other optional lists below.
        if (summary.getLocalIdentifier() != null) {
            resource
                .setIdentifier(
                    summary
                        .getLocalIdentifier()
                        .stream()
                        .map(i -> new ScholixIdentifier(i.getId(), i.getType()))
                        .collect(Collectors.toList()));
        }

        // A summary without typology yields a resource without object type instead of an NPE.
        if (summary.getTypology() != null) {
            resource.setObjectType(summary.getTypology().toString());
        }

        if (summary.getTitle() != null && !summary.getTitle().isEmpty()) {
            resource.setTitle(summary.getTitle().get(0));
        }

        if (summary.getAuthor() != null) {
            resource
                .setCreator(
                    summary
                        .getAuthor()
                        .stream()
                        .map(c -> new ScholixEntityId(c, null))
                        .collect(Collectors.toList()));
        }

        if (summary.getDate() != null && !summary.getDate().isEmpty()) {
            resource.setPublicationDate(summary.getDate().get(0));
        }

        if (summary.getPublisher() != null) {
            resource
                .setPublisher(
                    summary
                        .getPublisher()
                        .stream()
                        .map(p -> new ScholixEntityId(p, null))
                        .collect(Collectors.toList()));
        }

        if (summary.getDatasources() != null) {
            // Each datasource becomes a "collected" provenance entry whose provider is
            // identified by the datasource id under the "dnet_identifier" schema.
            resource
                .setCollectedFrom(
                    summary
                        .getDatasources()
                        .stream()
                        .map(
                            d -> new ScholixCollectedFrom(
                                new ScholixEntityId(
                                    d.getDatasourceName(),
                                    Collections
                                        .singletonList(
                                            new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))),
                                "collected",
                                d.getCompletionStatus()))
                        .collect(Collectors.toList()));
        }

        return resource;
    }

    /** @return the identifiers of the resource, possibly {@code null} */
    public List<ScholixIdentifier> getIdentifier() {
        return identifier;
    }

    /** @param identifier the identifiers to store (kept by reference) */
    public void setIdentifier(List<ScholixIdentifier> identifier) {
        this.identifier = identifier;
    }

    /** @return the internal (dnet) identifier, possibly {@code null} */
    public String getDnetIdentifier() {
        return dnetIdentifier;
    }

    /** @param dnetIdentifier the internal (dnet) identifier to store */
    public void setDnetIdentifier(String dnetIdentifier) {
        this.dnetIdentifier = dnetIdentifier;
    }

    /** @return the object type, possibly {@code null} */
    public String getObjectType() {
        return objectType;
    }

    /** @param objectType the object type to store */
    public void setObjectType(String objectType) {
        this.objectType = objectType;
    }

    /** @return the object sub-type, possibly {@code null} */
    public String getObjectSubType() {
        return objectSubType;
    }

    /** @param objectSubType the object sub-type to store */
    public void setObjectSubType(String objectSubType) {
        this.objectSubType = objectSubType;
    }

    /** @return the title, possibly {@code null} */
    public String getTitle() {
        return title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the creators, possibly {@code null} */
    public List<ScholixEntityId> getCreator() {
        return creator;
    }

    /** @param creator the creators to store (kept by reference) */
    public void setCreator(List<ScholixEntityId> creator) {
        this.creator = creator;
    }

    /** @return the publication date string, possibly {@code null} */
    public String getPublicationDate() {
        return publicationDate;
    }

    /** @param publicationDate the publication date string to store */
    public void setPublicationDate(String publicationDate) {
        this.publicationDate = publicationDate;
    }

    /** @return the publishers, possibly {@code null} */
    public List<ScholixEntityId> getPublisher() {
        return publisher;
    }

    /** @param publisher the publishers to store (kept by reference) */
    public void setPublisher(List<ScholixEntityId> publisher) {
        this.publisher = publisher;
    }

    /** @return the provenance entries, possibly {@code null} */
    public List<ScholixCollectedFrom> getCollectedFrom() {
        return collectedFrom;
    }

    /** @param collectedFrom the provenance entries to store (kept by reference) */
    public void setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
        this.collectedFrom = collectedFrom;
    }
}

@ -1,44 +0,0 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
/**
 * Serializable bean describing a datasource by name and id, together with a completion
 * status label.
 */
public class CollectedFromType implements Serializable {

    private String datasourceName;
    private String datasourceId;
    private String completionStatus;

    /** Creates an instance whose properties are all {@code null}. */
    public CollectedFromType() {
    }

    /**
     * Creates a populated instance.
     *
     * @param datasourceName the datasource name
     * @param datasourceId the datasource id
     * @param completionStatus the completion status label
     */
    public CollectedFromType(
        final String datasourceName, final String datasourceId, final String completionStatus) {
        this.completionStatus = completionStatus;
        this.datasourceId = datasourceId;
        this.datasourceName = datasourceName;
    }

    /** @return the datasource name, possibly {@code null} */
    public String getDatasourceName() {
        return this.datasourceName;
    }

    /** @param datasourceName the datasource name to store */
    public void setDatasourceName(final String datasourceName) {
        this.datasourceName = datasourceName;
    }

    /** @return the datasource id, possibly {@code null} */
    public String getDatasourceId() {
        return this.datasourceId;
    }

    /** @param datasourceId the datasource id to store */
    public void setDatasourceId(final String datasourceId) {
        this.datasourceId = datasourceId;
    }

    /** @return the completion status, possibly {@code null} */
    public String getCompletionStatus() {
        return this.completionStatus;
    }

    /** @param completionStatus the completion status to store */
    public void setCompletionStatus(final String completionStatus) {
        this.completionStatus = completionStatus;
    }
}

@ -1,33 +0,0 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
/** Serializable bean holding a value together with the scheme it is expressed in. */
public class SchemeValue implements Serializable {

    private String scheme;
    private String value;

    /** Creates an instance whose scheme and value are both {@code null}. */
    public SchemeValue() {
    }

    /**
     * Creates a populated instance.
     *
     * @param scheme the scheme of the value
     * @param value the value itself
     */
    public SchemeValue(final String scheme, final String value) {
        this.value = value;
        this.scheme = scheme;
    }

    /** @return the scheme, possibly {@code null} */
    public String getScheme() {
        return this.scheme;
    }

    /** @param scheme the scheme to store */
    public void setScheme(final String scheme) {
        this.scheme = scheme;
    }

    /** @return the value, possibly {@code null} */
    public String getValue() {
        return this.value;
    }

    /** @param value the value to store */
    public void setValue(final String value) {
        this.value = value;
    }
}

@ -1,321 +0,0 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.RelatedItemInfo;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
/**
 * Serializable summary of a record: identifiers, typology, descriptive metadata, counters of
 * related items and the datasources the record was collected from.
 *
 * <p>The {@code description} property is exposed to Jackson under the name {@code abstract}.
 */
public class ScholixSummary implements Serializable {

    private String id;
    private List<TypedIdentifier> localIdentifier;
    private Typology typology;
    private List<String> title;
    private List<String> author;
    private List<String> date;
    private String description;
    private List<SchemeValue> subject;
    private List<String> publisher;
    private long relatedPublications;
    private long relatedDatasets;
    private long relatedUnknown;
    private List<CollectedFromType> datasources;

    /** @return the record id, possibly {@code null} */
    public String getId() {
        return this.id;
    }

    /** @param id the record id to store */
    public void setId(final String id) {
        this.id = id;
    }

    /** @return the typed identifiers of the record, possibly {@code null} */
    public List<TypedIdentifier> getLocalIdentifier() {
        return this.localIdentifier;
    }

    /** @param localIdentifier the typed identifiers to store (kept by reference) */
    public void setLocalIdentifier(final List<TypedIdentifier> localIdentifier) {
        this.localIdentifier = localIdentifier;
    }

    /** @return the typology, possibly {@code null} */
    public Typology getTypology() {
        return this.typology;
    }

    /** @param typology the typology to store */
    public void setTypology(final Typology typology) {
        this.typology = typology;
    }

    /** @return the titles, possibly {@code null} */
    public List<String> getTitle() {
        return this.title;
    }

    /** @param title the titles to store (kept by reference) */
    public void setTitle(final List<String> title) {
        this.title = title;
    }

    /** @return the author names, possibly {@code null} */
    public List<String> getAuthor() {
        return this.author;
    }

    /** @param author the author names to store (kept by reference) */
    public void setAuthor(final List<String> author) {
        this.author = author;
    }

    /** @return the dates, possibly {@code null} */
    public List<String> getDate() {
        return this.date;
    }

    /** @param date the dates to store (kept by reference) */
    public void setDate(final List<String> date) {
        this.date = date;
    }

    /** @return the description, serialized by Jackson as {@code abstract} */
    @JsonProperty("abstract")
    public String getDescription() {
        return this.description;
    }

    /** @param description the description, read by Jackson from {@code abstract} */
    @JsonProperty("abstract")
    public void setDescription(final String description) {
        this.description = description;
    }

    /** @return the subjects, possibly {@code null} */
    public List<SchemeValue> getSubject() {
        return this.subject;
    }

    /** @param subject the subjects to store (kept by reference) */
    public void setSubject(final List<SchemeValue> subject) {
        this.subject = subject;
    }

    /** @return the publisher names, possibly {@code null} */
    public List<String> getPublisher() {
        return this.publisher;
    }

    /** @param publisher the publisher names to store (kept by reference) */
    public void setPublisher(final List<String> publisher) {
        this.publisher = publisher;
    }

    /** @return the number of related publications */
    public long getRelatedPublications() {
        return this.relatedPublications;
    }

    /** @param relatedPublications the number of related publications */
    public void setRelatedPublications(final long relatedPublications) {
        this.relatedPublications = relatedPublications;
    }

    /** @return the number of related datasets */
    public long getRelatedDatasets() {
        return this.relatedDatasets;
    }

    /** @param relatedDatasets the number of related datasets */
    public void setRelatedDatasets(final long relatedDatasets) {
        this.relatedDatasets = relatedDatasets;
    }

    /** @return the number of related items of unknown type */
    public long getRelatedUnknown() {
        return this.relatedUnknown;
    }

    /** @param relatedUnknown the number of related items of unknown type */
    public void setRelatedUnknown(final long relatedUnknown) {
        this.relatedUnknown = relatedUnknown;
    }

    /** @return the datasources, possibly {@code null} */
    public List<CollectedFromType> getDatasources() {
        return this.datasources;
    }

    /** @param datasources the datasources to store (kept by reference) */
    public void setDatasources(final List<CollectedFromType> datasources) {
        this.datasources = datasources;
    }

    /**
     * Converts a DLI publication, dataset or unknown record into a summary.
     *
     * <p>The related-item counters are taken from a freshly created {@link RelatedItemInfo},
     * i.e. they carry whatever values its no-argument constructor provides.
     *
     * @param oaf the record to convert
     * @return the summary, or {@code null} when {@code oaf} is none of the three supported types
     * @throws RuntimeException wrapping any failure raised while converting
     */
    public static ScholixSummary fromOAF(final Oaf oaf) {
        try {
            final RelatedItemInfo relatedItemInfo = new RelatedItemInfo();
            final ScholixSummary converted;
            if (oaf instanceof DLIPublication) {
                converted = summaryFromPublication((DLIPublication) oaf, relatedItemInfo);
            } else if (oaf instanceof DLIDataset) {
                converted = summaryFromDataset((DLIDataset) oaf, relatedItemInfo);
            } else if (oaf instanceof DLIUnknown) {
                converted = summaryFromUnknown((DLIUnknown) oaf, relatedItemInfo);
            } else {
                converted = null;
            }
            return converted;
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds the summary of a dataset. Optional parts of the record are copied only when
     * present; only relevant dates whose qualifier class name is "date" (case-insensitive)
     * are kept, and only the first description is used.
     */
    private static ScholixSummary summaryFromDataset(
        final DLIDataset item, final RelatedItemInfo relatedItemInfo) {
        final ScholixSummary result = new ScholixSummary();
        result.setId(item.getId());

        if (item.getPid() != null) {
            final List<TypedIdentifier> pids = item
                .getPid()
                .stream()
                .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
                .collect(Collectors.toList());
            result.setLocalIdentifier(pids);
        }

        result.setTypology(Typology.dataset);

        if (item.getTitle() != null) {
            final List<String> titles = item
                .getTitle()
                .stream()
                .map(StructuredProperty::getValue)
                .collect(Collectors.toList());
            result.setTitle(titles);
        }

        if (item.getAuthor() != null) {
            final List<String> authors = item
                .getAuthor()
                .stream()
                .map(Author::getFullname)
                .collect(Collectors.toList());
            result.setAuthor(authors);
        }

        if (item.getRelevantdate() != null) {
            final List<String> dates = item
                .getRelevantdate()
                .stream()
                .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname()))
                .map(StructuredProperty::getValue)
                .collect(Collectors.toList());
            result.setDate(dates);
        }

        if (item.getDescription() != null && !item.getDescription().isEmpty()) {
            result.setDescription(item.getDescription().get(0).getValue());
        }

        if (item.getSubject() != null) {
            final List<SchemeValue> subjects = item
                .getSubject()
                .stream()
                .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue()))
                .collect(Collectors.toList());
            result.setSubject(subjects);
        }

        if (item.getPublisher() != null) {
            result.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
        }

        result.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
        result.setRelatedPublications(relatedItemInfo.getRelatedPublication());
        result.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());

        if (item.getDlicollectedfrom() != null) {
            final List<CollectedFromType> sources = item
                .getDlicollectedfrom()
                .stream()
                .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()))
                .collect(Collectors.toList());
            result.setDatasources(sources);
        }

        return result;
    }

    /**
     * Builds the summary of a publication. Optional parts of the record are copied only when
     * present; only relevant dates whose qualifier class name is "date" (case-insensitive)
     * are kept, and only the first description is used.
     */
    private static ScholixSummary summaryFromPublication(
        final DLIPublication item, final RelatedItemInfo relatedItemInfo) {
        final ScholixSummary result = new ScholixSummary();
        result.setId(item.getId());

        if (item.getPid() != null) {
            final List<TypedIdentifier> pids = item
                .getPid()
                .stream()
                .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
                .collect(Collectors.toList());
            result.setLocalIdentifier(pids);
        }

        result.setTypology(Typology.publication);

        if (item.getTitle() != null) {
            final List<String> titles = item
                .getTitle()
                .stream()
                .map(StructuredProperty::getValue)
                .collect(Collectors.toList());
            result.setTitle(titles);
        }

        if (item.getAuthor() != null) {
            final List<String> authors = item
                .getAuthor()
                .stream()
                .map(Author::getFullname)
                .collect(Collectors.toList());
            result.setAuthor(authors);
        }

        if (item.getRelevantdate() != null) {
            final List<String> dates = item
                .getRelevantdate()
                .stream()
                .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname()))
                .map(StructuredProperty::getValue)
                .collect(Collectors.toList());
            result.setDate(dates);
        }

        if (item.getDescription() != null && !item.getDescription().isEmpty()) {
            result.setDescription(item.getDescription().get(0).getValue());
        }

        if (item.getSubject() != null) {
            final List<SchemeValue> subjects = item
                .getSubject()
                .stream()
                .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue()))
                .collect(Collectors.toList());
            result.setSubject(subjects);
        }

        if (item.getPublisher() != null) {
            result.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
        }

        result.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
        result.setRelatedPublications(relatedItemInfo.getRelatedPublication());
        result.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());

        if (item.getDlicollectedfrom() != null) {
            final List<CollectedFromType> sources = item
                .getDlicollectedfrom()
                .stream()
                .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()))
                .collect(Collectors.toList());
            result.setDatasources(sources);
        }

        return result;
    }

    /**
     * Builds the summary of a record of unknown type: only id, identifiers, counters, typology
     * and datasources are populated.
     */
    private static ScholixSummary summaryFromUnknown(
        final DLIUnknown item, final RelatedItemInfo relatedItemInfo) {
        final ScholixSummary result = new ScholixSummary();
        result.setId(item.getId());

        if (item.getPid() != null) {
            final List<TypedIdentifier> pids = item
                .getPid()
                .stream()
                .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
                .collect(Collectors.toList());
            result.setLocalIdentifier(pids);
        }

        result.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
        result.setRelatedPublications(relatedItemInfo.getRelatedPublication());
        result.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
        result.setTypology(Typology.unknown);

        if (item.getDlicollectedfrom() != null) {
            final List<CollectedFromType> sources = item
                .getDlicollectedfrom()
                .stream()
                .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()))
                .collect(Collectors.toList());
            result.setDatasources(sources);
        }

        return result;
    }
}

@ -1,33 +0,0 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
/** Serializable bean holding an identifier value and its type. */
public class TypedIdentifier implements Serializable {

    private String id;
    private String type;

    /** Creates an instance whose id and type are both {@code null}. */
    public TypedIdentifier() {
    }

    /**
     * Creates a populated instance.
     *
     * @param id the identifier value
     * @param type the identifier type
     */
    public TypedIdentifier(final String id, final String type) {
        this.type = type;
        this.id = id;
    }

    /** @return the identifier value, possibly {@code null} */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier value to store */
    public void setId(final String id) {
        this.id = id;
    }

    /** @return the identifier type, possibly {@code null} */
    public String getType() {
        return this.type;
    }

    /** @param type the identifier type to store */
    public void setType(final String type) {
        this.type = type;
    }
}

@ -1,8 +0,0 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
/** Kinds of record a summary can describe. */
public enum Typology implements Serializable {

    /** The record is a dataset. */
    dataset,

    /** The record is a publication. */
    publication,

    /** The record is of unknown kind. */
    unknown
}

@ -1,131 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.provision.scholix.ScholixCollectedFrom;
import eu.dnetlib.dhp.provision.scholix.ScholixEntityId;
import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier;
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
 * Converts a Crossref JSON record into a {@link ScholixResource}.
 *
 * <p>The record is accepted either bare (a JSON object with a top-level {@code DOI} member) or
 * wrapped in an object whose {@code _source} member holds the record.
 */
public class CrossRefParserJSON {

    // Shared by every parsed resource: the list itself is immutable, but the contained beans
    // expose setters, so callers should treat them as read-only.
    private static final List<ScholixCollectedFrom> collectedFrom = generateCrossrefCollectedFrom("complete");

    /**
     * Parses one record.
     *
     * @param record the JSON text of the record, may be {@code null}
     * @return the parsed resource (object type "publication"), or {@code null} when the record
     *         is {@code null}, is not a JSON object, has neither {@code _source} nor
     *         {@code DOI}, or carries no usable DOI
     * @throws com.google.gson.JsonParseException when {@code record} is not valid JSON
     */
    public static ScholixResource parseRecord(final String record) {
        if (record == null)
            return null;
        final JsonElement jElement = new JsonParser().parse(record);
        // Arrays, primitives and JSON null cannot hold a record.
        if (jElement == null || !jElement.isJsonObject())
            return null;
        final JsonObject root = jElement.getAsJsonObject();

        final JsonObject message;
        if (root.has("_source")) {
            final JsonElement source = root.get("_source");
            if (source == null || !source.isJsonObject())
                return null;
            message = source.getAsJsonObject();
        } else if (root.has("DOI")) {
            message = root;
        } else {
            return null;
        }

        // The DOI is needed to build the dnet identifier: without it no resource is produced.
        final String doi = asStringOrNull(message.get("DOI"));
        if (doi == null)
            return null;

        final ScholixResource currentObject = new ScholixResource();
        currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi")));

        final JsonElement created = message.get("created");
        if (created != null && created.isJsonObject()) {
            final String dateTime = asStringOrNull(created.getAsJsonObject().get("date-time"));
            if (dateTime != null) {
                currentObject.setPublicationDate(dateTime);
            }
        }

        final JsonElement title = message.get("title");
        if (title != null && title.isJsonArray() && title.getAsJsonArray().size() > 0) {
            // Only the first title is used.
            final String firstTitle = asStringOrNull(title.getAsJsonArray().get(0));
            if (firstTitle != null) {
                currentObject.setTitle(firstTitle);
            }
        }

        final JsonElement authors = message.get("author");
        if (authors != null && authors.isJsonArray()) {
            final List<ScholixEntityId> authorList = new ArrayList<>();
            for (final JsonElement anAuthor : authors.getAsJsonArray()) {
                if (anAuthor == null || !anAuthor.isJsonObject()) {
                    continue;
                }
                final JsonObject currentAuth = anAuthor.getAsJsonObject();
                final String family = asStringOrNull(currentAuth.get("family"));
                final String given = asStringOrNull(currentAuth.get("given"));
                authorList
                    .add(
                        new ScholixEntityId(
                            String.format("%s %s", family == null ? "" : family, given == null ? "" : given),
                            null));
            }
            currentObject.setCreator(authorList);
        }

        final String publisher = asStringOrNull(message.get("publisher"));
        if (publisher != null) {
            currentObject.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null)));
        }

        currentObject.setCollectedFrom(collectedFrom);
        currentObject.setObjectType("publication");
        currentObject.setDnetIdentifier(generateId(doi, "doi", "publication"));
        return currentObject;
    }

    /** Returns the string value of a JSON primitive, or {@code null} for anything else. */
    private static String asStringOrNull(final JsonElement element) {
        return element != null && element.isJsonPrimitive() ? element.getAsString() : null;
    }

    /** Builds the single-entry provenance list naming Crossref as a "resolved" provider. */
    private static List<ScholixCollectedFrom> generateCrossrefCollectedFrom(
        final String completionStatus) {
        final ScholixEntityId scholixEntityId = new ScholixEntityId(
            "Crossref",
            Collections
                .singletonList(
                    new ScholixIdentifier("dli_________::crossref", "dnet_identifier")));
        return Collections
            .singletonList(
                new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus));
    }

    /**
     * Builds a dnet identifier: a type prefix ("50|" publication, "60|" dataset, "70|" unknown)
     * followed by the MD5 of {@code "<pid>::<pidType>"}, both lower-cased and trimmed.
     *
     * @throws IllegalArgumentException when {@code entityType} is not one of the three known values
     */
    private static String generateId(
        final String pid, final String pidType, final String entityType) {
        final String type;
        switch (entityType) {
            case "publication":
                type = "50|";
                break;
            case "dataset":
                type = "60|";
                break;
            case "unknown":
                type = "70|";
                break;
            default:
                throw new IllegalArgumentException("unexpected value " + entityType);
        }
        // Locale.ROOT keeps the hash input independent of the JVM default locale.
        return type
            + DHPUtils
                .md5(
                    String
                        .format(
                            "%s::%s",
                            pid.toLowerCase(java.util.Locale.ROOT).trim(),
                            pidType.toLowerCase(java.util.Locale.ROOT).trim()));
    }
}

@ -1,90 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.io.ByteArrayOutputStream;
import java.util.zip.Inflater;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
/**
 * Minimal HTTP client that fetches a Crossref record by DOI from an index reachable at
 * {@code http://<host>:9200/<index>/<indexType>/<doi>} and converts it with
 * {@link CrossRefParserJSON}.
 */
public class CrossrefClient {

    private String host;
    private String index = "crossref";
    private String indexType = "item";

    /** @param host the host name of the index server */
    public CrossrefClient(String host) {
        this.host = host;
    }

    /** @return the host name of the index server */
    public String getHost() {
        return host;
    }

    /** @param host the host name of the index server */
    public void setHost(String host) {
        this.host = host;
    }

    /** @return the index name (default "crossref") */
    public String getIndex() {
        return index;
    }

    /** @param index the index name */
    public void setIndex(String index) {
        this.index = index;
    }

    /** @return the index type (default "item") */
    public String getIndexType() {
        return indexType;
    }

    /** @param indexType the index type */
    public void setIndexType(String indexType) {
        this.indexType = indexType;
    }

    /**
     * Decodes a Base64 string and inflates the resulting bytes.
     *
     * @param blob the Base64-encoded, deflate-compressed payload
     * @return the inflated payload decoded as UTF-8 (assumes the payload is UTF-8 JSON)
     * @throws RuntimeException when the blob cannot be decoded or inflated
     */
    private static String decompressBlob(final String blob) {
        final Inflater decompresser = new Inflater();
        try {
            final byte[] byteArray = Base64.decodeBase64(blob.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            decompresser.setInput(byteArray);
            final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
            final byte[] buffer = new byte[8192];
            while (!decompresser.finished()) {
                final int size = decompresser.inflate(buffer);
                // inflate() returns 0 without finishing when the input is truncated or a
                // preset dictionary is required; bail out instead of spinning forever.
                if (size == 0 && (decompresser.needsInput() || decompresser.needsDictionary())) {
                    throw new java.util.zip.DataFormatException("Truncated or unsupported compressed data");
                }
                bos.write(buffer, 0, size);
            }
            return new String(bos.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
        } catch (Exception e) {
            throw new RuntimeException("Wrong record:" + blob, e);
        } finally {
            // Always release the native resources held by the inflater.
            decompresser.end();
        }
    }

    /**
     * Fetches and parses the record of the given DOI.
     *
     * <p>Lookup is best-effort: any failure (network, unexpected payload, parsing) results in
     * {@code null} rather than an exception.
     *
     * @param doi the DOI to look up; "/" is escaped as "%2F" in the request path
     * @return the parsed resource, or {@code null} when it cannot be obtained
     */
    public ScholixResource getResourceByDOI(final String doi) {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            final HttpGet httpGet = new HttpGet(
                String
                    .format(
                        "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replace("/", "%2F")));
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                String json = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
                // Records may be stored compressed in the "blob" member of "_source".
                if (json.contains("blob")) {
                    final JsonElement root = new JsonParser().parse(json);
                    json = decompressBlob(
                        root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString());
                }
                return CrossRefParserJSON.parseRecord(json);
            }
        } catch (Exception e) {
            // Deliberate best-effort: callers receive null when the record is unavailable.
            return null;
        }
    }
}

@ -1,229 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.provision.scholix.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.scholexplorer.relation.RelInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
/**
 * Converts Datacite JSON records into Scholix resources and links.
 *
 * <p>All JSON paths are resolved relative to {@link #getRootPath()} (default
 * {@code $.attributes}).
 */
public class Datacite2Scholix {

    private String rootPath = "$.attributes";

    final RelationMapper relationMapper;

    /**
     * @param relationMapper lookup from lower-cased relation type to relation info; only needed
     *        by {@link #generateScholixFromJson(String)}
     */
    public Datacite2Scholix(RelationMapper relationMapper) {
        this.relationMapper = relationMapper;
    }

    /**
     * Generates the Scholix links declared by the related identifiers of a Datacite record.
     *
     * <p>Only related identifiers carrying all of {@code relatedIdentifierType},
     * {@code relationType} and {@code relatedIdentifier} are considered. A related identifier
     * that cannot be converted (for instance because its relation type is unknown to the
     * mapper) is skipped without failing the whole record.
     *
     * @param dJson the Datacite record as JSON text
     * @return the generated links, or {@code null} when the record has no usable related
     *         identifier
     */
    public List<Scholix> generateScholixFromJson(final String dJson) {
        final List<Map<String, String>> declared = getRelatedIdentifiers(dJson);
        if (declared == null) {
            return null;
        }
        final List<Map<String, String>> relIds = declared
            .stream()
            .filter(
                m -> m.containsKey("relatedIdentifierType")
                    && m.containsKey("relationType")
                    && m.containsKey("relatedIdentifier"))
            .collect(Collectors.toList());
        if (relIds.isEmpty()) {
            return null;
        }

        final String updated = JsonPath.read(dJson, rootPath + ".updated");
        final ScholixResource resource = generateDataciteScholixResource(dJson);

        return relIds
            .stream()
            .flatMap(
                s -> {
                    try {
                        return generateScholix(
                            resource,
                            "" + s.get("relatedIdentifier"),
                            s.get("relatedIdentifierType"),
                            s.get("relationType"),
                            updated).stream();
                    } catch (Throwable e) {
                        // Best-effort per relation: a broken entry must not drop the others.
                        return Collections.<Scholix> emptyList().stream();
                    }
                })
            .collect(Collectors.toList());
    }

    /** @return the JSON path prefix under which the record attributes are read */
    public String getRootPath() {
        return rootPath;
    }

    /** @param rootPath the JSON path prefix under which the record attributes are read */
    public void setRootPath(String rootPath) {
        this.rootPath = rootPath;
    }

    /**
     * Generates the links between {@code source} and the resource identified by {@code pid}.
     *
     * <p>For a DOI target a single link is returned, with a target carrying only its
     * identifier and without a generated link identifier. For any other pid type the target is
     * an "unknown" resource and both the direct and the inverse link are returned, each with a
     * generated identifier.
     * NOTE(review): the DOI branch never calls {@code generateIdentifier()} — presumably the
     * target is resolved later; confirm this asymmetry is intended.
     */
    private List<Scholix> generateScholix(
        ScholixResource source,
        final String pid,
        final String pidtype,
        final String relType,
        final String updated) {

        final boolean doiTarget = "doi".equalsIgnoreCase(pidtype);

        final ScholixResource target = new ScholixResource();
        target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype)));
        if (!doiTarget) {
            target.setDnetIdentifier(generateId(pid, pidtype, "unknown"));
            target.setObjectType("unknown");
            target.setCollectedFrom(generateDataciteCollectedFrom("incomplete"));
        }

        final RelInfo relInfo = relationMapper.get(relType.toLowerCase(java.util.Locale.ROOT));
        if (relInfo == null) {
            // Unknown relation type: nothing can be generated for this related identifier.
            return Collections.emptyList();
        }

        final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider();

        final Scholix direct = newScholix(
            source,
            target,
            provider,
            source.getPublisher(),
            new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()),
            updated);
        if (doiTarget) {
            return Collections.singletonList(direct);
        }
        direct.generateIdentifier();

        final Scholix inverse = newScholix(
            target,
            source,
            provider,
            source.getPublisher(),
            new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal()),
            updated);
        inverse.generateIdentifier();

        final List<Scholix> result = new ArrayList<>();
        result.add(direct);
        result.add(inverse);
        return result;
    }

    /** Creates a link with the given ends, single link provider, publishers, relation and date. */
    private static Scholix newScholix(
        final ScholixResource from,
        final ScholixResource to,
        final ScholixEntityId provider,
        final List<ScholixEntityId> publisher,
        final ScholixRelationship relationship,
        final String publicationDate) {
        final Scholix s = new Scholix();
        s.setSource(from);
        s.setTarget(to);
        s.setLinkprovider(Collections.singletonList(provider));
        s.setPublisher(publisher);
        s.setRelationship(relationship);
        s.setPublicationDate(publicationDate);
        return s;
    }

    /**
     * Builds the Scholix resource described by a Datacite record: DOI, object type, dnet
     * identifier, "complete" Datacite provenance and, when present, publisher, issued date,
     * first title and creators.
     *
     * @param dJson the Datacite record as JSON text
     * @return the resource describing the record
     */
    public ScholixResource generateDataciteScholixResource(String dJson) {
        final ScholixResource resource = new ScholixResource();

        final String doi = JsonPath.read(dJson, rootPath + ".doi");
        resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi")));
        resource.setObjectType(getType(dJson));
        resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType()));
        resource.setCollectedFrom(generateDataciteCollectedFrom("complete"));

        final String publisher = JsonPath.read(dJson, rootPath + ".publisher");
        if (StringUtils.isNotBlank(publisher)) {
            resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null)));
        }

        final String date = getDate(dJson);
        if (StringUtils.isNotBlank(date)) {
            resource.setPublicationDate(date);
        }

        final String title = getTitle(dJson);
        if (StringUtils.isNotBlank(title)) {
            resource.setTitle(title);
        }

        resource.setCreator(getCreators(dJson));
        return resource;
    }

    /** Returns one entity per creator name, or {@code null} when the record lists none. */
    private List<ScholixEntityId> getCreators(final String json) {
        final List<String> creatorName = JsonPath.read(json, rootPath + ".creators[*].name");
        if (creatorName == null || creatorName.isEmpty()) {
            return null;
        }
        return creatorName
            .stream()
            .map(s -> new ScholixEntityId(s, null))
            .collect(Collectors.toList());
    }

    /** Returns the first title of the record, or {@code null} when it has none. */
    private String getTitle(final String json) {
        final List<String> titles = JsonPath.read(json, rootPath + ".titles[*].title");
        return titles != null && !titles.isEmpty() ? titles.get(0) : null;
    }

    /** Returns the date of the first entry whose dateType is "issued" (case-insensitive), or {@code null}. */
    private String getDate(final String json) {
        final List<Map<String, String>> dates = JsonPath.read(json, rootPath + ".dates");
        if (dates == null) {
            return null;
        }
        return dates
            .stream()
            .filter(s -> "issued".equalsIgnoreCase(s.get("dateType")))
            .findFirst()
            .map(s -> s.get("date"))
            .orElse(null);
    }

    /** Builds the single-entry provenance list naming Datacite as a "collected" provider. */
    private List<ScholixCollectedFrom> generateDataciteCollectedFrom(final String completionStatus) {
        final ScholixEntityId scholixEntityId = new ScholixEntityId(
            "Datasets in Datacite",
            Collections
                .singletonList(
                    new ScholixIdentifier("dli_________::datacite", "dnet_identifier")));
        return Collections
            .singletonList(
                new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus));
    }

    /**
     * Returns "publication" when the record's bibtex type is "article" (case-insensitive) and
     * "dataset" otherwise, including when the type cannot be read.
     */
    private String getType(final String json) {
        try {
            final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex");
            return "article".equalsIgnoreCase(bibtext) ? "publication" : "dataset";
        } catch (Throwable e) {
            // A missing or malformed type falls back to the default.
            return "dataset";
        }
    }

    /** Reads every entry of the record's relatedIdentifiers array. */
    private List<Map<String, String>> getRelatedIdentifiers(final String json) {
        return JsonPath.read(json, rootPath + ".relatedIdentifiers[*]");
    }

    /**
     * Builds a dnet identifier: a type prefix ("50|" publication, "60|" dataset, "70|" unknown)
     * followed by the MD5 of {@code "<pid>::<pidType>"}, both lower-cased and trimmed.
     *
     * @param pid the persistent identifier value
     * @param pidType the persistent identifier type
     * @param entityType one of "publication", "dataset" or "unknown"
     * @return the generated identifier
     * @throws IllegalArgumentException when {@code entityType} is not one of the three known values
     */
    public static String generateId(final String pid, final String pidType, final String entityType) {
        final String type;
        switch (entityType) {
            case "publication":
                type = "50|";
                break;
            case "dataset":
                type = "60|";
                break;
            case "unknown":
                type = "70|";
                break;
            default:
                throw new IllegalArgumentException("unexpected value " + entityType);
        }
        // Locale.ROOT keeps the hash input independent of the JVM default locale.
        return type
            + DHPUtils
                .md5(
                    String
                        .format(
                            "%s::%s",
                            pid.toLowerCase(java.util.Locale.ROOT).trim(),
                            pidType.toLowerCase(java.util.Locale.ROOT).trim()));
    }
}

@ -1,75 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
/**
 * Minimal HTTP client for a Datacite index reachable at {@code http://<host>:9200}: it can
 * iterate over the records updated since a timestamp and fetch a single record by DOI.
 */
public class DataciteClient {

    private String host;
    private String index = "datacite";
    private String indexType = "dump";

    private final Datacite2Scholix d2s;

    /** @param host the host name of the index server */
    public DataciteClient(String host) {
        this.host = host;
        // No relation mapper is needed: this client only builds resources, never links.
        d2s = new Datacite2Scholix(null);
        d2s.setRootPath("$._source.attributes");
    }

    /**
     * Returns an iterable over the records whose timestamp is not older than the given one.
     * Each call to {@code iterator()} issues a new search against the index.
     *
     * @param timestamp the lower bound passed to {@link DataciteClientIterator}
     * @return an iterable producing the records as JSON strings
     */
    public Iterable<String> getDatasetsFromTs(final Long timestamp) {
        return () -> {
            try {
                return new DataciteClientIterator(host, index, timestamp);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        };
    }

    /** @return the host name of the index server */
    public String getHost() {
        return host;
    }

    /** @param host the host name of the index server */
    public void setHost(String host) {
        this.host = host;
    }

    /** @return the index name (default "datacite") */
    public String getIndex() {
        return index;
    }

    /** @param index the index name */
    public void setIndex(String index) {
        this.index = index;
    }

    /** @return the index type (default "dump") */
    public String getIndexType() {
        return indexType;
    }

    /** @param indexType the index type */
    public void setIndexType(String indexType) {
        this.indexType = indexType;
    }

    /**
     * Fetches the record of the given DOI and converts it to a Scholix resource.
     *
     * <p>Lookup is best-effort: any failure (network, missing record, unexpected payload)
     * results in {@code null} rather than an exception.
     *
     * @param doi the DOI to look up; "/" is escaped as "%2F" in the request path
     * @return the resource, or {@code null} when it cannot be obtained
     */
    public ScholixResource getDatasetByDOI(final String doi) {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            final HttpGet httpGet = new HttpGet(
                String
                    .format(
                        "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replace("/", "%2F")));
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                final String json = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
                return d2s.generateDataciteScholixResource(json);
            }
        } catch (Exception e) {
            // Deliberate best-effort: callers receive null when the record is unavailable.
            return null;
        }
    }
}

@ -1,120 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
/**
 * Iterator over the records of an index, read page by page through the scroll API of the
 * server at {@code http://<esHost>:9200}. Each element is the JSON text of one hit's
 * {@code _source}.
 */
public class DataciteClientIterator implements Iterator<String> {

    static final String blobPath = "$.hits.hits[*]._source";
    static final String scrollIdPath = "$._scroll_id";

    String scrollId;

    List<String> buffer;

    final String esHost;
    final String esIndex;
    final ObjectMapper mapper = new ObjectMapper();

    /**
     * Starts a scrolling search for the records whose {@code timestamp} is greater than or
     * equal to the given value, and loads the first page (up to 1000 records).
     *
     * @param esHost the host name of the index server
     * @param esIndex the index to search
     * @param timestamp the lower bound of the search; it is moved back by 7200 before use
     *        (presumably two hours expressed in seconds — TODO confirm the unit)
     * @throws IOException declared for callers; failures of the request itself surface as
     *         {@link RuntimeException}
     */
    public DataciteClientIterator(final String esHost, final String esIndex, long timestamp)
        throws IOException {

        this.esHost = esHost;
        this.esIndex = esIndex;
        // THIS FIX IS NECESSARY to avoid different timezone
        timestamp -= (60 * 60 * 2);
        final String body = getResponse(
            String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
            String
                .format(
                    "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp));
        scrollId = getJPathString(scrollIdPath, body);
        buffer = getBlobs(body);
    }

    /**
     * Sends a POST request and returns the response body.
     *
     * @param url the URL to post to
     * @param json the JSON request body, or {@code null} to post without a body
     * @return the response body decoded as UTF-8
     * @throws RuntimeException when the request fails or the client cannot be closed
     */
    public String getResponse(final String url, final String json) {
        final CloseableHttpClient client = HttpClients.createDefault();
        try {
            final HttpPost httpPost = new HttpPost(url);
            if (json != null) {
                // Encode the body explicitly instead of relying on the entity default charset.
                final StringEntity entity = new StringEntity(json, "UTF-8");
                httpPost.setEntity(entity);
                httpPost.setHeader("Accept", "application/json");
                httpPost.setHeader("Content-type", "application/json");
            }
            // Close the response so that its connection is released.
            try (CloseableHttpResponse response = client.execute(httpPost)) {
                return IOUtils.toString(response.getEntity().getContent(), "UTF-8");
            }
        } catch (Exception e) {
            throw new RuntimeException("Error on executing request ", e);
        } finally {
            try {
                client.close();
            } catch (IOException e) {
                throw new RuntimeException("Unable to close client ", e);
            }
        }
    }

    /**
     * Reads a string at the given JSON path.
     *
     * @return the string, {@code null} when the value is not a string, or an empty string
     *         when the path cannot be read
     */
    private String getJPathString(final String jsonPath, final String json) {
        try {
            final Object o = JsonPath.read(json, jsonPath);
            if (o instanceof String)
                return (String) o;
            return null;
        } catch (Exception e) {
            return "";
        }
    }

    /** Extracts the {@code _source} of every hit of a response and serializes each as JSON. */
    private List<String> getBlobs(final String body) {
        final JSONArray array = JsonPath.read(body, blobPath);
        return array
            .stream()
            .map(
                o -> {
                    try {
                        return mapper.writeValueAsString(o);
                    } catch (Throwable e) {
                        throw new RuntimeException(e);
                    }
                })
            .collect(Collectors.toList());
    }

    @Override
    public boolean hasNext() {
        return (buffer != null && !buffer.isEmpty());
    }

    /**
     * Returns the next record; when the current page is exhausted the following page is
     * requested eagerly. If that page cannot be parsed the problem is reported on standard
     * error and the iteration ends after the returned element.
     *
     * @throws java.util.NoSuchElementException when no record is left
     */
    @Override
    public String next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("No more records available");
        }
        final String nextItem = buffer.remove(0);
        if (buffer.isEmpty()) {
            final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId);
            final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
            // Every scroll response carries the id to use for the next request, and it may
            // differ from the previous one.
            final String newScrollId = getJPathString(scrollIdPath, body);
            if (newScrollId != null && !newScrollId.isEmpty()) {
                scrollId = newScrollId;
            }
            try {
                buffer = getBlobs(body);
            } catch (Throwable e) {
                // Best-effort: keep the empty buffer so that hasNext() reports the end.
                System.err.println("Unable to read the next page of results, stopping iteration: " + body);
            }
        }
        return nextItem;
    }
}

@ -1,72 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.net.URI;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.provision.scholix.Scholix;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
/**
 * Command line job that asks {@link DataciteClient} for the datasets starting from a given
 * timestamp, converts each of them into Scholix links and appends the links, serialized as JSON,
 * to a Hadoop {@link SequenceFile}.
 */
public class RetrieveUpdateFromDatacite {

    /**
     * Entry point.
     *
     * @param args command line arguments described by
     *        {@code /eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json}
     * @throws Exception on any failure while parsing arguments, harvesting or writing
     */
    public static void main(String[] args) throws Exception {
        // Read the parameter specification as UTF-8 and close the resource stream afterwards.
        final String jsonConfiguration;
        try (java.io.InputStream in = RetrieveUpdateFromDatacite.class
            .getResourceAsStream("/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json")) {
            jsonConfiguration = IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);
        final String hdfsuri = parser.get("namenode");
        Path hdfswritepath = new Path(parser.get("targetPath"));
        final long timestamp = Long.parseLong(parser.get("timestamp"));
        final String host = parser.get("indexHost");
        // NOTE(review): indexName is parsed but never handed to DataciteClient - confirm whether
        // the client should receive it.
        final String index = parser.get("indexName");
        // ====== Init HDFS File System Object
        Configuration conf = new Configuration();
        // Set FileSystem URI
        conf.set("fs.defaultFS", hdfsuri);
        // Because of Maven
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        // The returned FileSystem is not used; presumably this call only checks that the
        // file system can be reached before writing - TODO confirm.
        FileSystem.get(URI.create(hdfsuri), conf);
        final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load());
        final ObjectMapper mapper = new ObjectMapper();
        try (SequenceFile.Writer writer = SequenceFile
            .createWriter(
                conf,
                SequenceFile.Writer.file(hdfswritepath),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class))) {
            final Text value = new Text();
            final IntWritable key = new IntWritable();
            int i = 0;
            for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) {
                i++;
                List<Scholix> scholix = d2s.generateScholixFromJson(dataset);
                if (scholix != null) {
                    // every link generated from the same dataset shares the dataset counter as key
                    key.set(i);
                    for (Scholix s : scholix) {
                        value.set(mapper.writeValueAsString(s));
                        writer.append(key, value);
                    }
                }
                // Progress is reported once per 10000 datasets; inside the inner loop it was
                // printed once per generated link (or not at all when no link was generated).
                if (i % 10000 == 0) {
                    System.out.println("wrote " + i);
                }
            }
        }
    }
}

@ -1,184 +0,0 @@
package eu.dnetlib.dhp.provision.update;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.provision.scholix.Scholix;
import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier;
import eu.dnetlib.dhp.provision.scholix.ScholixRelationship;
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
 * Spark job that resolves the targets of Scholix links read from a sequence file.
 *
 * <ol>
 * <li>stepA: the JSON values of the sequence file are parsed and stored as a Scholix dataset;</li>
 * <li>stepB: the identifiers of the targets without dnetIdentifier are looked up through
 * {@link CrossrefClient} and {@link DataciteClient};</li>
 * <li>resolved_json: links are left-joined with the resolved resources and written as JSON text,
 * together with their inverse links.</li>
 * </ol>
 */
public class SparkResolveScholixTarget {

    // One shared mapper per JVM instead of a new instance for every record.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Entry point.
     *
     * @param args command line arguments described by
     *        {@code /eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json}
     * @throws Exception on any failure while parsing arguments or running the Spark job
     */
    public static void main(String[] args) throws Exception {
        // Read the parameter specification as UTF-8 and close the resource stream afterwards.
        final String jsonConfiguration;
        try (java.io.InputStream in = SparkResolveScholixTarget.class
            .getResourceAsStream("/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json")) {
            jsonConfiguration = IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);
        final SparkConf conf = new SparkConf();
        final String master = parser.get("master");
        final String sourcePath = parser.get("sourcePath");
        final String workingDirPath = parser.get("workingDirPath");
        final String indexHost = parser.get("indexHost");
        try (SparkSession spark = getSession(conf, master)) {
            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

            // stepA: sequence file of JSON values -> Scholix dataset
            spark
                .createDataset(
                    sc
                        .sequenceFile(sourcePath, IntWritable.class, Text.class)
                        .map(Tuple2::_2)
                        .map(s -> MAPPER.readValue(s.toString(), Scholix.class))
                        .rdd(),
                    Encoders.bean(Scholix.class))
                .write()
                .save(workingDirPath + "/stepA");
            Dataset<Scholix> s1 = spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class));

            // stepB: resolve every distinct identifier of the targets lacking a dnetIdentifier
            s1
                .where(s1.col("target.dnetIdentifier").isNull())
                .select(s1.col("target.identifier"))
                .distinct()
                .map(
                    (MapFunction<Row, ScholixResource>) f -> resolveResource(
                        indexHost, ((Row) f.getList(0).get(0)).getString(0)),
                    Encoders.bean(ScholixResource.class))
                .write()
                .mode(SaveMode.Overwrite)
                .save(workingDirPath + "/stepB");
            Dataset<ScholixResource> s2 = spark
                .read()
                .load(workingDirPath + "/stepB")
                .as(Encoders.bean(ScholixResource.class));

            // final step: attach the resolved targets and emit direct and inverse links as JSON
            s1
                .joinWith(
                    s2,
                    s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")),
                    "left")
                .flatMap(
                    (FlatMapFunction<Tuple2<Scholix, ScholixResource>, Scholix>) f -> resolveScholix(
                        f._1(), f._2()).iterator(),
                    Encoders.bean(Scholix.class))
                .javaRDD()
                .map(s -> MAPPER.writeValueAsString(s))
                .saveAsTextFile(workingDirPath + "/resolved_json");
        }
    }

    /**
     * Looks up a pid first through CrossrefClient, then through DataciteClient; when neither
     * returns a resource, builds a placeholder resource of type "unknown" for the pid.
     *
     * @param indexHost host passed to both clients
     * @param pid the identifier to resolve
     * @return the resolved resource, or the "unknown" placeholder
     * @throws Exception if one of the lookups fails
     */
    private static ScholixResource resolveResource(final String indexHost, final String pid) throws Exception {
        final ScholixResource publication = new CrossrefClient(indexHost).getResourceByDOI(pid);
        if (publication != null) {
            return publication;
        }
        final ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid);
        if (dataset != null) {
            return dataset;
        }
        final ScholixResource r = new ScholixResource();
        r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi")));
        r.setObjectType("unknown");
        // Locale.ROOT keeps the hashed identifier independent from the JVM default locale.
        r
            .setDnetIdentifier(
                "70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase(java.util.Locale.ROOT).trim())));
        return r;
    }

    /**
     * Produces the links to emit for one joined pair.
     *
     * <p>A link that already has an identifier is emitted unchanged. Otherwise its target is
     * either marked as "unknown" (no resolved resource) or replaced by the resolved resource
     * with lower-cased identifiers; the link identifier is then generated and the link is
     * emitted together with its inverse.
     *
     * @param s the link coming from the left side of the join
     * @param target the resolved resource, {@code null} when the join found no match
     * @return the links to emit
     * @throws Exception if the identifier generation fails
     */
    private static List<Scholix> resolveScholix(final Scholix s, final ScholixResource target) throws Exception {
        final List<Scholix> res = new ArrayList<>();
        if (StringUtils.isNotBlank(s.getIdentifier())) {
            res.add(s);
            return res;
        }
        if (target == null) {
            final ScholixResource currentTarget = s.getTarget();
            currentTarget.setObjectType("unknown");
            currentTarget
                .setDnetIdentifier(
                    Datacite2Scholix
                        .generateId(
                            currentTarget.getIdentifier().get(0).getIdentifier(),
                            currentTarget.getIdentifier().get(0).getSchema(),
                            currentTarget.getObjectType()));
        } else {
            target
                .setIdentifier(
                    target
                        .getIdentifier()
                        .stream()
                        .map(
                            d -> new ScholixIdentifier(
                                d.getIdentifier().toLowerCase(java.util.Locale.ROOT),
                                d.getSchema().toLowerCase(java.util.Locale.ROOT)))
                        .collect(Collectors.toList()));
            s.setTarget(target);
        }
        s.generateIdentifier();
        res.add(s);
        res.add(createInverse(s));
        return res;
    }

    /**
     * Builds the inverse of a link: source and target are swapped, the relationship is created
     * from the inverse, schema and name of the original one, and a new identifier is generated.
     *
     * @param s the direct link
     * @return the inverse link
     * @throws Exception if the identifier generation fails
     */
    private static Scholix createInverse(final Scholix s) throws Exception {
        final Scholix inverse = new Scholix();
        inverse.setTarget(s.getSource());
        inverse.setSource(s.getTarget());
        inverse.setLinkprovider(s.getLinkprovider());
        inverse.setPublicationDate(s.getPublicationDate());
        inverse.setPublisher(s.getPublisher());
        inverse
            .setRelationship(
                new ScholixRelationship(
                    s.getRelationship().getInverse(),
                    s.getRelationship().getSchema(),
                    s.getRelationship().getName()));
        inverse.generateIdentifier();
        return inverse;
    }

    /**
     * Creates (or reuses) the Spark session for this job.
     *
     * @param conf the Spark configuration
     * @param master the Spark master to connect to
     * @return the Spark session
     */
    private static SparkSession getSession(SparkConf conf, String master) {
        return SparkSession
            .builder()
            .config(conf)
            .appName(SparkResolveScholixTarget.class.getSimpleName())
            .master(master)
            .getOrCreate();
    }
}

@ -1,14 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirPath",
"paramDescription": "the working path where generated files",
"paramRequired": true
}
]

@ -1,20 +0,0 @@
[
{
"paramName": "n",
"paramLongName": "nameNode",
"paramDescription": "the Name Node",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target path",
"paramRequired": true
}
]

@ -1,45 +0,0 @@
[
{
"paramName":"nd",
"paramLongName":"newDeposition",
"paramDescription": "if it is a new deposition (true) or a new version (false)",
"paramRequired": true
},
{
"paramName":"cri",
"paramLongName":"conceptRecordId",
"paramDescription": "The id of the concept record for a new version",
"paramRequired": false
},
{
"paramName":"hdfsp",
"paramLongName":"hdfsPath",
"paramDescription": "the path of the folder to find files to send to Zenodo",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "nameNode",
"paramDescription": "the name node",
"paramRequired": true
},
{
"paramName": "at",
"paramLongName": "accessToken",
"paramDescription": "the access token for the deposition",
"paramRequired": false
},
{
"paramName":"cu",
"paramLongName":"connectionUrl",
"paramDescription": "the url to connect to deposit",
"paramRequired": false
},
{
"paramName":"m",
"paramLongName":"metadata",
"paramDescription": "metadata associated to the deposition",
"paramRequired": false
}
]

@ -1,4 +0,0 @@
{
"cluster1": "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54",
"cluster2": "10.19.65.55, 10.19.65.56, 10.19.65.57, 10.19.65.58"
}

@ -1,14 +0,0 @@
[
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "master should be local or yarn",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "the working path",
"paramRequired": true
}
]

@ -1,14 +0,0 @@
[
{
"paramName": "c",
"paramLongName": "cluster",
"paramDescription": "should be cluster1 or cluster2",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "index",
"paramDescription": "index name",
"paramRequired": true
}
]

@ -1,33 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path of the files to index",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "index",
"paramDescription": "the index name",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "cluster",
"paramDescription": "the index cluster",
"paramRequired": true
},
{
"paramName": "id",
"paramLongName": "idPath",
"paramDescription": "the identifier field name",
"paramRequired": true
}
]

@ -1,20 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirPath",
"paramDescription": "the working path where generated files",
"paramRequired": true
},
{
"paramName": "g",
"paramLongName": "graphPath",
"paramDescription": "the graph path",
"paramRequired": true
}
]

@ -1,20 +0,0 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirPath",
"paramDescription": "the working path where generated files",
"paramRequired": true
},
{
"paramName": "r",
"paramLongName": "relationPath",
"paramDescription": "the relationPath path ",
"paramRequired": true
}
]

@ -1,26 +0,0 @@
[
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source path",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirPath",
"paramDescription": "the working Dir Path",
"paramRequired": true
},
{
"paramName": "h",
"paramLongName": "indexHost",
"paramDescription": "the host name of the index",
"paramRequired": true
}
]

@ -1,33 +0,0 @@
[
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the name node",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target path where the generated files are written",
"paramRequired": true
},
{
"paramName": "ts",
"paramLongName": "timestamp",
"paramDescription": "the timestamp for incremental harvesting",
"paramRequired": true
},
{
"paramName": "ih",
"paramLongName": "indexHost",
"paramDescription": "the ip name of the index",
"paramRequired": true
},
{
"paramName": "in",
"paramLongName": "indexName",
"paramDescription": "the name of the index",
"paramRequired": true
}
]

@ -1,331 +0,0 @@
{
"mappings": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"linkprovider": {
"type": "nested",
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "keyword"
}
}
},
"publicationDate": {
"type": "keyword"
},
"relationship": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"source": {
"type": "nested",
"properties": {
"collectedFrom": {
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"provider": {
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"provisionMode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"creator": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dnetIdentifier": {
"type": "keyword"
},
"identifier": {
"type": "nested",
"properties": {
"identifier": {
"type": "keyword"
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"objectType": {
"type": "keyword"
},
"publicationDate": {
"type": "keyword"
},
"publisher": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"target": {
"type": "nested",
"properties": {
"collectedFrom": {
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"provider": {
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"provisionMode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"creator": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dnetIdentifier": {
"type": "keyword"
},
"identifier": {
"type": "nested",
"properties": {
"identifier": {
"type": "keyword"
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"objectType": {
"type": "keyword"
},
"publicationDate": {
"type": "keyword"
},
"publisher": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

@ -1,132 +0,0 @@
{
"mappings": {
"properties": {
"abstract": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"datasources": {
"type": "nested",
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"datasourceId": {
"type": "keyword"
},
"datasourceName": {
"type": "keyword"
}
}
},
"date": {
"type": "keyword"
},
"id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"localIdentifier": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"type": {
"type": "keyword"
}
}
},
"publisher": {
"type": "keyword"
},
"relatedDatasets": {
"type": "long"
},
"relatedPublications": {
"type": "long"
},
"relatedUnknown": {
"type": "long"
},
"subject": {
"properties": {
"scheme": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"typology": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

@ -1,42 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.wf.rerun.failnodes</name>
<value>false</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

@ -1,49 +0,0 @@
<workflow-app name="Export Scholexplorer Graph to OpenAIRE" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingDirPath</name>
<description>the source path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ExtractOAF"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ExtractOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractOAF</name>
<class>eu.dnetlib.dhp.export.SparkExportContentForOpenAire</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,14 +0,0 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

@ -1,86 +0,0 @@
<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingDirPath</name>
<description>the source path</description>
</property>
<property>
<name>index</name>
<description>the index name</description>
</property>
<property>
<name>esCluster</name>
<description>the Index cluster</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
</parameters>
<start to="DropAndCreateIndex"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DropAndCreateIndex">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.provision.DropAndCreateESIndex</main-class>
<arg>-i</arg><arg>${index}</arg>
<arg>-c</arg><arg>${esCluster}</arg>
</java>
<ok to="indexSummary"/>
<error to="Kill"/>
</action>
<action name="indexSummary">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>index summary</name>
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingDirPath}/summary_json</arg>
<arg>--index</arg><arg>${index}_object</arg>
<arg>--idPath</arg><arg>id</arg>
<arg>--cluster</arg><arg>${esCluster}</arg>
</spark>
<ok to="indexScholix"/>
<error to="Kill"/>
</action>
<action name="indexScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>index scholix</name>
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
<arg>--index</arg><arg>${index}_scholix</arg>
<arg>--idPath</arg><arg>identifier</arg>
<arg>--cluster</arg><arg>${esCluster}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,14 +0,0 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

@ -1,116 +0,0 @@
<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingDirPath</name>
<description>the source path</description>
</property>
<property>
<name>graphPath</name>
<description>the graph path</description>
</property>
<property>
<name>index</name>
<description>the index name</description>
</property>
<property>
<name>esCluster</name>
<description>the Index cluster</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
</parameters>
<start to="DeleteTargetPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteTargetPath">
<fs>
<delete path='${workingDirPath}'/>
<mkdir path='${workingDirPath}'/>
</fs>
<ok to="CalculateRelatedItem"/>
<error to="Kill"/>
</action>
<action name="CalculateRelatedItem">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>calculate for each ID the number of related Dataset, publication and Unknown</name>
<class>eu.dnetlib.dhp.provision.SparkExtractRelationCount</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--relationPath</arg><arg>${graphPath}/relation</arg>
</spark>
<ok to="generateSummary"/>
<error to="Kill"/>
</action>
<action name="generateSummary">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>generate Summary</name>
<class>eu.dnetlib.dhp.provision.SparkGenerateSummaryIndex</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--graphPath</arg><arg>${graphPath}</arg>
</spark>
<ok to="generateScholix"/>
<error to="Kill"/>
</action>
<action name="generateScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>generate Scholix</name>
<class>eu.dnetlib.dhp.provision.SparkGenerateScholixIndex</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--graphPath</arg><arg>${graphPath}</arg>
</spark>
<ok to="datasetToJson"/>
<error to="Kill"/>
</action>
<action name="datasetToJson">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>generate Scholix</name>
<class>eu.dnetlib.dhp.provision.SparkConvertDatasetToJson</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT}</spark-opts>
<arg>-m</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingDirPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,14 +0,0 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

@ -1,97 +0,0 @@
<workflow-app name="Keep On Synch datacite" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingDirPath</name>
<description>the source path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>index</name>
<description>index name</description>
</property>
<property>
<name>timestamp</name>
<description>timestamp from incremental harvesting</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingDirPath}/synch'/>
<mkdir path='${workingDirPath}/synch'/>
</fs>
<ok to="ImportDataciteUpdate"/>
<error to="Kill"/>
</action>
<action name="ImportDataciteUpdate">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.provision.update.RetrieveUpdateFromDatacite</main-class>
<arg>-t</arg><arg>${workingDirPath}/synch/input_json</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-ts</arg><arg>${timestamp}</arg>
<arg>-ih</arg><arg>ip-90-147-167-25.ct1.garrservices.it</arg>
<arg>-in</arg><arg>datacite</arg>
</java>
<ok to="resolveScholix"/>
<error to="Kill"/>
</action>
<action name="resolveScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>resolve and generate Scholix</name>
<class>eu.dnetlib.dhp.provision.update.SparkResolveScholixTarget</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
<arg>-m</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${workingDirPath}/synch/input_json</arg>
<arg>-w</arg><arg>${workingDirPath}/synch</arg>
<arg>-h</arg><arg>ip-90-147-167-25.ct1.garrservices.it</arg>
</spark>
<ok to="indexScholix"/>
<error to="Kill"/>
</action>
<action name="indexScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>index scholix</name>
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingDirPath}/synch/resolved_json</arg>
<arg>--index</arg><arg>${index}_scholix</arg>
<arg>--idPath</arg><arg>identifier</arg>
<arg>--type</arg><arg>scholix</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,48 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.wf.rerun.failnodes</name>
<value>false</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

@ -1,53 +0,0 @@
<workflow-app name="Send Dump to Zenodo" xmlns="uri:oozie:workflow:0.5">
    <!--
        Runs eu.dnetlib.dhp.export.zenodo.MakeTar with the target path, the name node and the source path.
        The upload step (send_zenodo) is only kept below as a commented-out template and is not part of the flow.
    -->
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the target path</description>
        </property>
        <!-- <property>-->
        <!-- <name>metadata</name>-->
        <!-- <description>the metadata</description>-->
        <!-- </property>-->
    </parameters>

    <!-- The start node must point at an action that is actually defined: send_zenodo exists only in comments, so the flow begins at MakeTar. -->
    <start to="MakeTar"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="MakeTar">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.dhp.export.zenodo.MakeTar</main-class>
            <arg>-t</arg><arg>${targetPath}</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-s</arg><arg>${sourcePath}</arg>
        </java>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <!-- Disabled upload step, kept as a template. The access token must be passed in as a workflow parameter (accessToken) and never be hard-coded in this file. -->
    <!-- <action name="send_zenodo">-->
    <!-- <java>-->
    <!-- <main-class>eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS</main-class>-->
    <!-- <arg>&#45;&#45;hdfsPath</arg><arg>/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar</arg>-->
    <!-- <arg>&#45;&#45;nameNode</arg><arg>${nameNode}</arg>-->
    <!-- <arg>&#45;&#45;accessToken</arg><arg>${accessToken}</arg>-->
    <!-- <arg>&#45;&#45;connectionUrl</arg><arg>https://zenodo.org/api/deposit/depositions</arg>-->
    <!-- <arg>&#45;&#45;metadata</arg><arg>${metadata}</arg>-->
    <!-- <arg>&#45;&#45;conceptRecordId</arg><arg>1200252</arg>-->
    <!-- <arg>&#45;&#45;newDeposition</arg><arg>false</arg>-->
    <!-- </java>-->
    <!-- <ok to="End"/>-->
    <!-- <error to="Kill"/>-->
    <!-- </action>-->

    <end name="End"/>
</workflow-app>

@ -1,102 +0,0 @@
package eu.dnetlib.dhp.export
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.provision.scholix.Scholix
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test
import scala.io.Source
import scala.collection.JavaConverters._
/**
 * Exercises the DLIToOAF conversion helpers against JSON fixtures on the test classpath.
 *
 * The tests print the converted objects and make no assertions, so they only fail
 * when a conversion or a (de)serialization step throws.
 */
class ExportDLITOOAFTest {

  val mapper = new ObjectMapper()

  /**
   * Reads a classpath resource (resolved relative to this class) fully into a string.
   * The underlying source is always closed, so the stream is not leaked.
   */
  private def readResource(name: String): String = {
    val source = Source.fromInputStream(getClass.getResourceAsStream(name))
    try source.mkString
    finally source.close()
  }

  /** Turns on indented output for the shared mapper. */
  private def enableIndentedOutput(): Unit =
    mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)

  /** Prints the current local date-time formatted with the pattern yyyy-MM-dd'T'HH:mm:ss'Z'. */
  @Test
  def testDate(): Unit = {
    println(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
  }

  /** Returns the provider names of the entries collected for the target of the given Scholix. */
  def extractDatasources(s: Scholix): List[String] = {
    s.getTarget.getCollectedFrom.asScala.map(c => c.getProvider.getName)(collection.breakOut)
  }

  /** Returns the datasource names listed in the given summary. */
  def extractDatasources(s: ScholixSummary): List[String] = {
    s.getDatasources.asScala.map(c => c.getDatasourceName)(collection.breakOut)
  }

  /** Converts a hand-built relation and prints the resulting source/target pair. */
  @Test
  def testMappingRele(): Unit = {
    val r: Relation = new Relation
    r.setSource("60|fbff1d424e045eecf24151a5fe3aa738")
    r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877")
    r.setRelType("IsReferencedBy")

    val r1 = DLIToOAF.convertDLIRelation(r)
    // Explicit tuple: same output as the auto-tupled println(a, b) form.
    println((r1.getSource, r1.getTarget))
  }

  /** Converts the publication.json fixture and prints the result as JSON. */
  @Test
  def testPublicationMapping(): Unit = {
    enableIndentedOutput()
    val json = readResource("publication.json")
    val oaf = DLIToOAF.convertDLIPublicationToOAF(mapper.readValue(json, classOf[DLIPublication]))
    println(mapper.writeValueAsString(oaf))
  }

  /** Converts the dataset.json fixture to an external reference and prints it. */
  @Test
  def testExternalReferenceMapping(): Unit = {
    enableIndentedOutput()
    val json = readResource("dataset.json")
    val oaf = DLIToOAF.convertDLIDatasetToExternalReference(mapper.readValue(json, classOf[DLIDataset]))
    println(oaf)
  }

  /** Round-trips the relation.json fixture through the mapper and prints it. */
  @Test
  def testRelationMapping(): Unit = {
    enableIndentedOutput()
    val json = readResource("relation.json")
    val oaf = mapper.readValue(json, classOf[Relation])
    println(mapper.writeValueAsString(oaf))
  }
}

@ -1,50 +0,0 @@
package eu.dnetlib.dhp.provision;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.Scholix;
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
import eu.dnetlib.dhp.provision.update.*;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
/**
 * Tests around the Datacite to Scholix conversion and the Datacite/Crossref lookup clients.
 */
public class DataciteClientTest {

    /**
     * Reads a classpath resource (resolved relative to this class) as UTF-8 text.
     * The stream is closed once read, and the charset is explicit so the result
     * does not depend on the platform default encoding.
     */
    private String readResource(final String name) throws java.io.IOException {
        try (java.io.InputStream in = getClass().getResourceAsStream(name)) {
            return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }

    /**
     * Generates Scholix records from the {@code datacite.json} fixture and prints them as JSON.
     * No assertion is made on the content; the test only fails if a step throws.
     */
    @Test
    public void dataciteSCholixTest() throws Exception {
        final String json = readResource("datacite.json");
        final RelationMapper mapper = RelationMapper.load();

        Datacite2Scholix ds = new Datacite2Scholix(mapper);
        final List<Scholix> s = ds.generateScholixFromJson(json);
        System.out.println(new ObjectMapper().writeValueAsString(s));
    }

    // public void testS() throws Exception {
    // RetrieveUpdateFromDatacite.main(new String[]{
    // "-n", "file:///data/new_s2.txt",
    // "-t", "/data/new_s2.txt",
    // "-ts", "1586974078",
    // "-ih", "ip-90-147-167-25.ct1.garrservices.it",
    // "-in", "datacite",
    // });
    //
    // }

    /**
     * Looks up one dataset DOI through {@code DataciteClient} and one publication DOI through
     * {@code CrossrefClient}, asserting that both lookups return a non-null resource.
     *
     * NOTE(review): this method carries no {@code @Test} annotation, so JUnit does not run it;
     * presumably that is deliberate because it depends on a hard-coded remote host. Confirm
     * before annotating it.
     */
    public void testResolveDataset() throws Exception {
        final ObjectMapper jsonMapper = new ObjectMapper();

        DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it");
        ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5");
        Assertions.assertNotNull(datasetByDOI);
        System.out.println(jsonMapper.writeValueAsString(datasetByDOI));

        CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it");
        ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46");
        Assertions.assertNotNull(crossrefByDOI);
        System.out.println(jsonMapper.writeValueAsString(crossrefByDOI));
    }
}

@ -1,13 +0,0 @@
package eu.dnetlib.dhp.provision;
import org.junit.jupiter.api.Test;
/**
 * Manual driver for {@code DropAndCreateESIndex}.
 */
public class DropAndCreateESIndexTest {

    /**
     * Calls {@code DropAndCreateESIndex.main} with the arguments
     * {@code -c localhost -i dli_shadow}. The method has no {@code @Test}
     * annotation, so JUnit does not pick it up.
     */
    public void testDropAndCreate() throws Exception {
        final String[] cliArgs = {"-c", "localhost", "-i", "dli_shadow"};
        DropAndCreateESIndex.main(cliArgs);
    }
}

@ -1,30 +0,0 @@
package eu.dnetlib.dhp.provision;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.Scholix;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
/**
 * Serialization and Scholix-generation smoke tests based on JSON fixtures.
 */
public class ExtractInfoTest {

    /**
     * Reads a classpath resource (resolved relative to this class) as UTF-8 text.
     * The stream is closed once read, and the charset is explicit so the result
     * does not depend on the platform default encoding.
     */
    private String readResource(final String name) throws java.io.IOException {
        try (java.io.InputStream in = getClass().getResourceAsStream(name)) {
            return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }

    /**
     * Serializes a {@code ScholixSummary} with a description to JSON, prints it, then reads it
     * back and prints the description. No assertion is made on the round-tripped value.
     */
    @Test
    public void testSerialization() throws Exception {
        ScholixSummary summary = new ScholixSummary();
        summary.setDescription("descrizione");

        ObjectMapper mapper = new ObjectMapper();
        String json = mapper.writeValueAsString(summary);
        System.out.println(json);
        System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription());
    }

    /**
     * Feeds the {@code summary.json} and {@code relation.json} fixtures to
     * {@code Scholix.generateScholixWithSource}; the test only fails if that call throws.
     */
    @Test
    public void testScholix() throws Exception {
        final String jsonSummary = readResource("summary.json");
        final String jsonRelation = readResource("relation.json");
        Scholix.generateScholixWithSource(jsonSummary, jsonRelation);
    }
}

@ -1,101 +0,0 @@
{
"dataInfo": {
"invisible": false,
"inferred": null,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": null
},
"lastupdatetimestamp": null,
"id": "60|719f19e5a996de1b87cddf93871bf2d4",
"originalId": [
"a0a3p2gws9::uniprot"
],
"collectedfrom": [
{
"key": "dli_________::europe_pmc__",
"value": "Europe PMC",
"dataInfo": null
}
],
"pid": [
{
"value": "acc63471",
"qualifier": {
"classid": "ena",
"classname": "ena",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": null
}
],
"dateofcollection": "2019-07-05T12:47:11.545+02:00",
"dateoftransformation": null,
"extraInfo": null,
"oaiprovenance": null,
"author": null,
"resulttype": {
"classid": "dataset",
"classname": "dataset",
"schemeid": "dataset",
"schemename": "dataset"
},
"language": null,
"country": null,
"subject": [],
"title": [
{
"value": "CMD domain-containing protein",
"qualifier": null,
"dataInfo": null
}
],
"relevantdate": [
{
"value": "2019-07-15T16:14:28.636",
"qualifier": {
"classid": "resolvedDate",
"classname": "resolvedDate",
"schemeid": "dnet::date",
"schemename": "dnet::date"
},
"dataInfo": null
}
],
"description": null,
"dateofacceptance": null,
"publisher": {
"value": "UniProt",
"dataInfo": null
},
"embargoenddate": null,
"source": null,
"fulltext": null,
"format": null,
"contributor": null,
"resourcetype": null,
"coverage": null,
"bestaccessright": null,
"context": null,
"externalReference": null,
"instance": [],
"storagedate": null,
"device": null,
"size": null,
"version": null,
"lastmetadataupdate": null,
"metadataversionnumber": null,
"geolocation": null,
"originalObjIdentifier": "europe_pmc__::719f19e5a996de1b87cddf93871bf2d4",
"dlicollectedfrom": [
{
"id": "dli_________::europe_pmc__",
"name": "Europe PMC",
"completionStatus": "complete",
"collectionMode": null
}
],
"completionStatus": "complete"
}

@ -1,128 +0,0 @@
{
"dataInfo": {
"invisible": false,
"inferred": null,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": null
},
"lastupdatetimestamp": null,
"id": "50|9e117414be07bf03cbce8889d22d661a",
"originalId": [
"9e117414be07bf03cbce8889d22d661a"
],
"collectedfrom": [
{
"key": "dli_________::crossref",
"value": "Crossref",
"dataInfo": null
}
],
"pid": [
{
"value": "10.1007/978-94-017-3490-5_15",
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": null
}
],
"dateofcollection": "2020-06-08T07:28:55.731Z",
"dateoftransformation": null,
"extraInfo": null,
"oaiprovenance": null,
"author": [
{
"fullname": "Calcaterra Domenico",
"name": null,
"surname": null,
"rank": null,
"pid": null,
"affiliation": null
},
{
"fullname": "Parise Mario",
"name": null,
"surname": null,
"rank": null,
"pid": null,
"affiliation": null
}
],
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemeid": "publication",
"schemename": "publication"
},
"language": null,
"country": null,
"subject":[
{
"value":"Strain-linked information about bacterial and archaeal biodiversity",
"qualifier":{
"classid":"dnet:subject",
"classname":"dnet:subject",
"schemeid":"",
"schemename":""
},
"dataInfo":null
}
],
"title": [
{
"value": "The Contribution of Historical Information in the Assessment of Landslide Hazard",
"qualifier": null,
"dataInfo": null
}
],
"relevantdate": [
{
"value": "2013-01-29T16:50:44Z",
"qualifier": {
"classid": "date",
"classname": "date",
"schemeid": "dnet::date",
"schemename": "dnet::date"
},
"dataInfo": null
}
],
"description": [
{
"value": null,
"dataInfo": null
}
],
"dateofacceptance": null,
"publisher": {
"value": "Springer Netherlands",
"dataInfo": null
},
"embargoenddate": null,
"source": null,
"fulltext": null,
"format": null,
"contributor": null,
"resourcetype": null,
"coverage": null,
"bestaccessright": null,
"context": null,
"externalReference": null,
"instance": [],
"journal": null,
"originalObjIdentifier": "dli_resolver::9e117414be07bf03cbce8889d22d661a",
"dlicollectedfrom": [
{
"id": "dli_________::crossref",
"name": "Crossref",
"completionStatus": "complete",
"collectionMode": "resolved"
}
],
"completionStatus": "complete"
}

@ -1,23 +0,0 @@
{
"subRelType": null,
"relClass": "datacite",
"dataInfo": {
"deletedbyinference": false,
"provenanceaction": null,
"inferred": null,
"inferenceprovenance": null,
"invisible": false,
"trust": "0.9"
},
"target": "50|00062410e2a15322480277d063c181bb",
"lastupdatetimestamp": null,
"relType": "IsReferencedBy",
"source": "60|4ee78ab329b49416b45c3774c132f244",
"collectedfrom": [
{
"dataInfo": null,
"value": "Europe PMC",
"key": "dli_________::europe_pmc__"
}
]
}

@ -1,136 +0,0 @@
{
"relationships": {
"client": {
"data": {
"type": "clients",
"id": "crossref.citations"
}
}
},
"attributes": {
"contributors": [
],
"titles": [
{
"title": "UV-visible spectroscopy in the interpretation of the tautomeric equilibrium of N,N(bis-3,5-di-bromo-salicyliden)-1,2-diaminobenzene and the redox activity of its Co(II) complex. A quantum chemical approach."
}
],
"descriptions": [
],
"referenceCount": 0,
"subjects": [
],
"container": {
"title": "Journal of Molecular Structure: THEOCHEM",
"firstPage": "97",
"volume": "367",
"lastPage": "110",
"identifierType": "ISSN",
"identifier": "0166-1280",
"type": "Journal"
},
"state": "findable",
"created": "2020-03-26T13:31:57.000Z",
"source": "levriero",
"metadataVersion": 0,
"version": null,
"isActive": true,
"contentUrl": null,
"geoLocations": [
],
"updated": "2020-03-26T13:31:58.000Z",
"fundingReferences": [
],
"viewCount": 0,
"registered": "2020-03-26T13:31:58.000Z",
"published": "1996",
"dates": [
{
"date": "1996-09",
"dateType": "Issued"
},
{
"date": "2019-04-17T13:58:25Z",
"dateType": "Updated"
}
],
"relatedIdentifiers": [
{
"relationType": "IsPartOf",
"relatedIdentifier": "0166-1280",
"relatedIdentifierType": "ISSN",
"resourceTypeGeneral": "Collection"
}
],
"reason": null,
"rightsList": [
{
"rightsUri": "https://www.elsevier.com/tdm/userlicense/1.0"
}
],
"schemaVersion": "http://datacite.org/schema/kernel-4",
"types": {
"resourceType": "JournalArticle",
"ris": "JOUR",
"resourceTypeGeneral": "Text",
"bibtex": "article",
"citeproc": "article-journal",
"schemaOrg": "ScholarlyArticle"
},
"publisher": "Elsevier BV",
"publicationYear": 1996,
"doi": "10.1016/s0166-1280(96)04575-7",
"language": null,
"sizes": [
],
"url": "https://linkinghub.elsevier.com/retrieve/pii/S0166128096045757",
"identifiers": [
{
"identifier": "https://doi.org/10.1016/s0166-1280(96)04575-7",
"identifierType": "DOI"
},
{
"identifier": "S0166128096045757",
"identifierType": "Publisher ID"
}
],
"citationCount": 0,
"formats": [
],
"downloadCount": 0,
"creators": [
{
"nameType": "Personal",
"givenName": "G.L.",
"name": "Estiú, G.L.",
"familyName": "Estiú",
"affiliation": [
]
},
{
"nameType": "Personal",
"givenName": "A.H.",
"name": "Jubert, A.H.",
"familyName": "Jubert",
"affiliation": [
]
},
{
"nameType": "Personal",
"givenName": "J.",
"name": "Costamagna, J.",
"familyName": "Costamagna",
"affiliation": [
]
},
{
"nameType": "Personal",
"givenName": "J.",
"name": "Vargas, J.",
"familyName": "Vargas",
"affiliation": [
]
}
]
}
}

@ -1 +0,0 @@
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"references","subRelType":null,"relClass":"datacite","source":"50|f2123fce7e56c73dc8f1bf64ec59b477","target":"50|b618cbe39ba940a29993ac324e5f9621","collectedFrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}]}

@ -1 +0,0 @@
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"IsReferencedBy","subRelType":null,"relClass":"datacite","source":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","target":"60|97519e00ee2cddfa1f5bcb5220429b8f","collectedfrom":[{"key":"dli_________::europe_pmc__","value":"Europe PMC","dataInfo":null}]}

@ -1 +0,0 @@
{"id":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","localIdentifier":[{"id":"16909284","type":"pbmid"},{"id":"10.1007/s00438-006-0155-3","type":"doi"}],"typology":"publication","title":["Effects of the Sabin-like mutations in domain V of the internal ribosome entry segment on translational efficiency of the Coxsackievirus B3.","Effects of the Sabin-like mutations in domain V of the internal ribosome entry segment on translational efficiency of the Coxsackievirus B3"],"author":["Ben Mhadheb-Gharbi Manel","Gharbi Jawhar","Paulous Sylvie","Brocard Michèle","Komaromva Anastasia","Aouni Mahjoub","M. Kean Katherine"],"date":[null,"2018-11-13","2006-08-14T15:43:22Z"],"subject":[],"publisher":null,"relatedPublications":1,"relatedDatasets":4,"relatedUnknown":0,"datasources":null,"abstract":"The domain V within the internal ribosome entry segment (IRES) of poliovirus (PV) is expected to be important in its own neurovirulence because it contains an attenuating mutation in each of the Sabin vaccine strains. In this study, we try to find out if the results observed in the case of Sabin vaccine strains of PV can be extrapolated to another virus belonging to the same genus of enteroviruses but with a different tropism. To test this hypothesis, we used the coxsackievirus B3 (CVB3), known to be the mo"}
Loading…
Cancel
Save