deleted old scholix project
parent
8535506c22
commit
ed684874f2
@ -1,82 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-dedup-scholexplorer</artifactId>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-graphx_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
@ -1,121 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import static java.util.Collections.reverseOrder;
|
||||
import static java.util.Map.Entry.comparingByValue;
|
||||
import static java.util.stream.Collectors.toMap;
|
||||
|
||||
import static org.apache.commons.lang.StringUtils.endsWith;
|
||||
import static org.apache.commons.lang.StringUtils.substringBefore;
|
||||
|
||||
import java.time.Year;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
|
||||
public class DatePicker {
|
||||
|
||||
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
|
||||
private static final String DATE_DEFAULT_SUFFIX = "01-01";
|
||||
private static final int YEAR_LB = 1300;
|
||||
private static final int YEAR_UB = Year.now().getValue() + 5;
|
||||
|
||||
public static Field<String> pick(final Collection<String> dateofacceptance) {
|
||||
|
||||
final Map<String, Integer> frequencies = dateofacceptance
|
||||
.parallelStream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
|
||||
|
||||
if (frequencies.isEmpty()) {
|
||||
return new Field<>();
|
||||
}
|
||||
|
||||
final Field<String> date = new Field<>();
|
||||
date.setValue(frequencies.keySet().iterator().next());
|
||||
|
||||
// let's sort this map by values first, filtering out invalid dates
|
||||
final Map<String, Integer> sorted = frequencies
|
||||
.entrySet()
|
||||
.stream()
|
||||
.filter(d -> StringUtils.isNotBlank(d.getKey()))
|
||||
.filter(d -> d.getKey().matches(DATE_PATTERN))
|
||||
.filter(d -> inRange(d.getKey()))
|
||||
.sorted(reverseOrder(comparingByValue()))
|
||||
.collect(
|
||||
toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
|
||||
|
||||
// shortcut
|
||||
if (sorted.size() == 0) {
|
||||
return date;
|
||||
}
|
||||
|
||||
// voting method (1/3 + 1) wins
|
||||
if (sorted.size() >= 3) {
|
||||
final int acceptThreshold = (sorted.size() / 3) + 1;
|
||||
final List<String> accepted = sorted
|
||||
.entrySet()
|
||||
.stream()
|
||||
.filter(e -> e.getValue() >= acceptThreshold)
|
||||
.map(e -> e.getKey())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// cannot find strong majority
|
||||
if (accepted.isEmpty()) {
|
||||
final int max = sorted.values().iterator().next();
|
||||
Optional<String> first = sorted
|
||||
.entrySet()
|
||||
.stream()
|
||||
.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
|
||||
.map(Map.Entry::getKey)
|
||||
.findFirst();
|
||||
if (first.isPresent()) {
|
||||
date.setValue(first.get());
|
||||
return date;
|
||||
}
|
||||
|
||||
date.setValue(sorted.keySet().iterator().next());
|
||||
return date;
|
||||
}
|
||||
|
||||
if (accepted.size() == 1) {
|
||||
date.setValue(accepted.get(0));
|
||||
return date;
|
||||
} else {
|
||||
final Optional<String> first = accepted
|
||||
.stream()
|
||||
.filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
|
||||
.findFirst();
|
||||
if (first.isPresent()) {
|
||||
date.setValue(first.get());
|
||||
return date;
|
||||
}
|
||||
|
||||
return date;
|
||||
}
|
||||
|
||||
// 1st non YYYY-01-01 is returned
|
||||
} else {
|
||||
if (sorted.size() == 2) {
|
||||
for (Map.Entry<String, Integer> e : sorted.entrySet()) {
|
||||
if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
|
||||
date.setValue(e.getKey());
|
||||
return date;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// none of the dates seems good enough, return the 1st one
|
||||
date.setValue(sorted.keySet().iterator().next());
|
||||
return date;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean inRange(final String date) {
|
||||
final int year = Integer.parseInt(substringBefore(date, "-"));
|
||||
return year >= YEAR_LB && year <= YEAR_UB;
|
||||
}
|
||||
}
|
@ -1,327 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class DedupRecordFactory {
|
||||
|
||||
public static JavaRDD<OafEntity> createDedupRecord(
|
||||
final JavaSparkContext sc,
|
||||
final SparkSession spark,
|
||||
final String mergeRelsInputPath,
|
||||
final String entitiesInputPath,
|
||||
final OafEntityType entityType,
|
||||
final DedupConfig dedupConf) {
|
||||
long ts = System.currentTimeMillis();
|
||||
// <id, json_entity>
|
||||
final JavaPairRDD<String, String> inputJsonEntities = spark
|
||||
.read()
|
||||
.load(entitiesInputPath)
|
||||
.as(Encoders.kryo(Oaf.class))
|
||||
.map(
|
||||
(MapFunction<Oaf, String>) p -> new org.codehaus.jackson.map.ObjectMapper().writeValueAsString(p),
|
||||
Encoders.STRING())
|
||||
.javaRDD()
|
||||
.mapToPair(
|
||||
(PairFunction<String, String, String>) it -> new Tuple2<>(
|
||||
MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it));
|
||||
|
||||
// <source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||
JavaPairRDD<String, String> mergeRels = spark
|
||||
.read()
|
||||
.load(mergeRelsInputPath)
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.where("relClass=='merges'")
|
||||
.javaRDD()
|
||||
.mapToPair(
|
||||
(PairFunction<Relation, String, String>) r -> new Tuple2<String, String>(r.getTarget(), r.getSource()));
|
||||
|
||||
// <dedup_id, json_entity_merged>
|
||||
final JavaPairRDD<String, String> joinResult = mergeRels
|
||||
.join(inputJsonEntities)
|
||||
.mapToPair(
|
||||
(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||
|
||||
JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
|
||||
|
||||
switch (entityType) {
|
||||
case publication:
|
||||
return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
|
||||
case dataset:
|
||||
return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
|
||||
case project:
|
||||
return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
|
||||
case software:
|
||||
return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
|
||||
case datasource:
|
||||
return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
|
||||
case organization:
|
||||
return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
|
||||
case otherresearchproduct:
|
||||
return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static DLIPublication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
|
||||
DLIPublication p = new DLIPublication(); // the result of the merge, to be returned at the end
|
||||
|
||||
p.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
pub -> {
|
||||
try {
|
||||
DLIPublication publication = mapper.readValue(pub, DLIPublication.class);
|
||||
|
||||
p.mergeFrom(publication);
|
||||
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
|
||||
// add to the list if they are not null
|
||||
if (publication.getDateofacceptance() != null)
|
||||
dateofacceptance.add(publication.getDateofacceptance().getValue());
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||
if (p.getDataInfo() == null)
|
||||
p.setDataInfo(new DataInfo());
|
||||
p.getDataInfo().setTrust("0.9");
|
||||
p.setLastupdatetimestamp(ts);
|
||||
return p;
|
||||
}
|
||||
|
||||
private static DLIDataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
|
||||
DLIDataset d = new DLIDataset(); // the result of the merge, to be returned at the end
|
||||
|
||||
d.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
dat -> {
|
||||
try {
|
||||
Dataset dataset = mapper.readValue(dat, Dataset.class);
|
||||
|
||||
d.mergeFrom(dataset);
|
||||
d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
|
||||
// add to the list if they are not null
|
||||
if (dataset.getDateofacceptance() != null)
|
||||
dateofacceptance.add(dataset.getDateofacceptance().getValue());
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
d.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||
if (d.getDataInfo() == null)
|
||||
d.setDataInfo(new DataInfo());
|
||||
d.getDataInfo().setTrust("0.9");
|
||||
d.setLastupdatetimestamp(ts);
|
||||
return d;
|
||||
}
|
||||
|
||||
private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
|
||||
Project p = new Project(); // the result of the merge, to be returned at the end
|
||||
|
||||
p.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
proj -> {
|
||||
try {
|
||||
Project project = mapper.readValue(proj, Project.class);
|
||||
|
||||
p.mergeFrom(project);
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
if (p.getDataInfo() == null)
|
||||
p.setDataInfo(new DataInfo());
|
||||
p.getDataInfo().setTrust("0.9");
|
||||
p.setLastupdatetimestamp(ts);
|
||||
return p;
|
||||
}
|
||||
|
||||
private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
|
||||
Software s = new Software(); // the result of the merge, to be returned at the end
|
||||
|
||||
s.setId(e._1());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
soft -> {
|
||||
try {
|
||||
Software software = mapper.readValue(soft, Software.class);
|
||||
|
||||
s.mergeFrom(software);
|
||||
s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
|
||||
// add to the list if they are not null
|
||||
if (software.getDateofacceptance() != null)
|
||||
dateofacceptance.add(software.getDateofacceptance().getValue());
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
s.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||
if (s.getDataInfo() == null)
|
||||
s.setDataInfo(new DataInfo());
|
||||
s.getDataInfo().setTrust("0.9");
|
||||
s.setLastupdatetimestamp(ts);
|
||||
return s;
|
||||
}
|
||||
|
||||
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
Datasource d = new Datasource(); // the result of the merge, to be returned at the end
|
||||
d.setId(e._1());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
dat -> {
|
||||
try {
|
||||
Datasource datasource = mapper.readValue(dat, Datasource.class);
|
||||
|
||||
d.mergeFrom(datasource);
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
if (d.getDataInfo() == null)
|
||||
d.setDataInfo(new DataInfo());
|
||||
d.getDataInfo().setTrust("0.9");
|
||||
d.setLastupdatetimestamp(ts);
|
||||
return d;
|
||||
}
|
||||
|
||||
private static Organization organizationMerger(
|
||||
Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
|
||||
Organization o = new Organization(); // the result of the merge, to be returned at the end
|
||||
|
||||
o.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
StringBuilder trust = new StringBuilder("0.0");
|
||||
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
pub -> {
|
||||
try {
|
||||
Organization organization = mapper.readValue(pub, Organization.class);
|
||||
|
||||
final String currentTrust = organization.getDataInfo().getTrust();
|
||||
if (!"1.0".equals(currentTrust)) {
|
||||
trust.setLength(0);
|
||||
trust.append(currentTrust);
|
||||
}
|
||||
o.mergeFrom(organization);
|
||||
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
|
||||
if (o.getDataInfo() == null) {
|
||||
o.setDataInfo(new DataInfo());
|
||||
}
|
||||
if (o.getDataInfo() == null)
|
||||
o.setDataInfo(new DataInfo());
|
||||
o.getDataInfo().setTrust("0.9");
|
||||
o.setLastupdatetimestamp(ts);
|
||||
|
||||
return o;
|
||||
}
|
||||
|
||||
private static OtherResearchProduct otherresearchproductMerger(
|
||||
Tuple2<String, Iterable<String>> e, final long ts) {
|
||||
|
||||
OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be
|
||||
// returned at the end
|
||||
|
||||
o.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
|
||||
if (e._2() != null)
|
||||
e
|
||||
._2()
|
||||
.forEach(
|
||||
orp -> {
|
||||
try {
|
||||
OtherResearchProduct otherResearchProduct = mapper
|
||||
.readValue(orp, OtherResearchProduct.class);
|
||||
|
||||
o.mergeFrom(otherResearchProduct);
|
||||
o
|
||||
.setAuthor(
|
||||
DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
|
||||
// add to the list if they are not null
|
||||
if (otherResearchProduct.getDateofacceptance() != null)
|
||||
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
|
||||
} catch (Exception exc) {
|
||||
throw new RuntimeException(exc);
|
||||
}
|
||||
});
|
||||
if (o.getDataInfo() == null)
|
||||
o.setDataInfo(new DataInfo());
|
||||
o.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||
o.getDataInfo().setTrust("0.9");
|
||||
o.setLastupdatetimestamp(ts);
|
||||
return o;
|
||||
}
|
||||
}
|
@ -1,239 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import com.wcohen.ss.JaroWinkler;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class DedupUtility {
|
||||
private static final Double THRESHOLD = 0.95;
|
||||
|
||||
public static Map<String, LongAccumulator> constructAccumulator(
|
||||
final DedupConfig dedupConf, final SparkContext context) {
|
||||
|
||||
Map<String, LongAccumulator> accumulators = new HashMap<>();
|
||||
|
||||
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
|
||||
accumulators.put(acc1, context.longAccumulator(acc1));
|
||||
String acc2 = String
|
||||
.format(
|
||||
"%s::%s",
|
||||
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
|
||||
accumulators.put(acc2, context.longAccumulator(acc2));
|
||||
String acc3 = String
|
||||
.format(
|
||||
"%s::%s",
|
||||
dedupConf.getWf().getEntityType(),
|
||||
String
|
||||
.format(
|
||||
"Skipped records for count(%s) >= %s",
|
||||
dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
|
||||
accumulators.put(acc3, context.longAccumulator(acc3));
|
||||
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
|
||||
accumulators.put(acc4, context.longAccumulator(acc4));
|
||||
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
|
||||
accumulators.put(acc5, context.longAccumulator(acc5));
|
||||
String acc6 = String
|
||||
.format(
|
||||
"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
|
||||
accumulators.put(acc6, context.longAccumulator(acc6));
|
||||
|
||||
return accumulators;
|
||||
}
|
||||
|
||||
public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
|
||||
return context.textFile(path);
|
||||
}
|
||||
|
||||
public static void deleteIfExists(String path) throws IOException {
|
||||
Configuration conf = new Configuration();
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
if (fileSystem.exists(new Path(path))) {
|
||||
fileSystem.delete(new Path(path), true);
|
||||
}
|
||||
}
|
||||
|
||||
public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));
|
||||
|
||||
return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
|
||||
}
|
||||
|
||||
static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
||||
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||
}
|
||||
|
||||
public static String md5(final String s) {
|
||||
try {
|
||||
final MessageDigest md = MessageDigest.getInstance("MD5");
|
||||
md.update(s.getBytes(StandardCharsets.UTF_8));
|
||||
return new String(Hex.encodeHex(md.digest()));
|
||||
} catch (final Exception e) {
|
||||
System.err.println("Error creating id");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||
int pa = countAuthorsPids(a);
|
||||
int pb = countAuthorsPids(b);
|
||||
List<Author> base, enrich;
|
||||
int sa = authorsSize(a);
|
||||
int sb = authorsSize(b);
|
||||
|
||||
if (pa == pb) {
|
||||
base = sa > sb ? a : b;
|
||||
enrich = sa > sb ? b : a;
|
||||
} else {
|
||||
base = pa > pb ? a : b;
|
||||
enrich = pa > pb ? b : a;
|
||||
}
|
||||
enrichPidFromList(base, enrich);
|
||||
return base;
|
||||
}
|
||||
|
||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
||||
if (base == null || enrich == null)
|
||||
return;
|
||||
final Map<String, Author> basePidAuthorMap = base
|
||||
.stream()
|
||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||
.flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a)))
|
||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||
|
||||
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||
.stream()
|
||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||
.flatMap(
|
||||
a -> a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(p -> !basePidAuthorMap.containsKey(p.toComparableString()))
|
||||
.map(p -> new Tuple2<>(p, a)))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
pidToEnrich
|
||||
.forEach(
|
||||
a -> {
|
||||
Optional<Tuple2<Double, Author>> simAuhtor = base
|
||||
.stream()
|
||||
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
||||
.max(Comparator.comparing(Tuple2::_1));
|
||||
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
|
||||
Author r = simAuhtor.get()._2();
|
||||
r.getPid().add(a._1());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static String createEntityPath(final String basePath, final String entityType) {
|
||||
return String.format("%s/%s", basePath, entityType);
|
||||
}
|
||||
|
||||
public static String createSimRelPath(final String basePath, final String entityType) {
|
||||
return String.format("%s/%s/simRel", basePath, entityType);
|
||||
}
|
||||
|
||||
public static String createMergeRelPath(final String basePath, final String entityType) {
|
||||
return String.format("%s/%s/mergeRel", basePath, entityType);
|
||||
}
|
||||
|
||||
private static Double sim(Author a, Author b) {
|
||||
|
||||
final Person pa = parse(a);
|
||||
final Person pb = parse(b);
|
||||
|
||||
if (pa.isAccurate() & pb.isAccurate()) {
|
||||
return new JaroWinkler()
|
||||
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
|
||||
} else {
|
||||
return new JaroWinkler()
|
||||
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
||||
}
|
||||
}
|
||||
|
||||
private static String normalize(final String s) {
|
||||
return nfd(s)
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||
// in case
|
||||
// of large input strings
|
||||
.replaceAll("(\\W)+", " ")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
private static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
private static Person parse(Author author) {
|
||||
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||
return new Person(author.getSurname() + ", " + author.getName(), false);
|
||||
} else {
|
||||
return new Person(author.getFullname(), false);
|
||||
}
|
||||
}
|
||||
|
||||
private static int countAuthorsPids(List<Author> authors) {
|
||||
if (authors == null)
|
||||
return 0;
|
||||
|
||||
return (int) authors.stream().filter(DedupUtility::hasPid).count();
|
||||
}
|
||||
|
||||
private static int authorsSize(List<Author> authors) {
|
||||
if (authors == null)
|
||||
return 0;
|
||||
return authors.size();
|
||||
}
|
||||
|
||||
private static boolean hasPid(Author a) {
|
||||
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
||||
return false;
|
||||
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||
}
|
||||
}
|
@ -1,182 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
import org.apache.spark.api.java.function.PairFlatMapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.BlockProcessor;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Serializable;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class Deduper implements Serializable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(Deduper.class);
|
||||
|
||||
/**
|
||||
* @return the list of relations generated by the deduplication
|
||||
* @param: the spark context
|
||||
* @param: list of JSON entities to be deduped
|
||||
* @param: the dedup configuration
|
||||
*/
|
||||
public static JavaPairRDD<String, String> dedup(
|
||||
JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
|
||||
|
||||
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
|
||||
|
||||
// create vertexes of the graph: <ID, MapDocument>
|
||||
JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
|
||||
|
||||
// create blocks for deduplication
|
||||
JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
|
||||
|
||||
// create relations by comparing only elements in the same group
|
||||
return computeRelations(context, blocks, config);
|
||||
|
||||
// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new
|
||||
// Edge<>(it._1().hashCode(),
|
||||
// it._2().hashCode(), "equalTo")).rdd();
|
||||
//
|
||||
// RDD<Tuple2<Object, MapDocument>> vertexes =
|
||||
// mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t ->
|
||||
// new
|
||||
// Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
|
||||
// accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
|
||||
//
|
||||
// return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the list of relations generated by the deduplication
|
||||
* @param: the spark context
|
||||
* @param: list of blocks
|
||||
* @param: the dedup configuration
|
||||
*/
|
||||
public static JavaPairRDD<String, String> computeRelations(
|
||||
JavaSparkContext context,
|
||||
JavaPairRDD<String, Iterable<MapDocument>> blocks,
|
||||
DedupConfig config) {
|
||||
|
||||
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
|
||||
|
||||
return blocks
|
||||
.flatMapToPair(
|
||||
(PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
|
||||
final SparkReporter reporter = new SparkReporter(accumulators);
|
||||
new BlockProcessor(config).process(it._1(), it._2(), reporter);
|
||||
return reporter.getRelations().iterator();
|
||||
})
|
||||
.mapToPair(
|
||||
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
|
||||
item._1() + item._2(), item))
|
||||
.reduceByKey((a, b) -> a)
|
||||
.mapToPair(
|
||||
(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the list of blocks based on clustering of dedup configuration
|
||||
* @param: the spark context
|
||||
* @param: list of entities: <id, entity>
|
||||
* @param: the dedup configuration
|
||||
*/
|
||||
public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(
|
||||
JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
|
||||
return mapDocs
|
||||
// the reduce is just to be sure that we haven't document with same id
|
||||
.reduceByKey((a, b) -> a)
|
||||
.map(Tuple2::_2)
|
||||
// Clustering: from <id, doc> to List<groupkey,doc>
|
||||
.flatMapToPair(
|
||||
(PairFlatMapFunction<MapDocument, String, MapDocument>) a -> DedupUtility
|
||||
.getGroupingKeys(config, a)
|
||||
.stream()
|
||||
.map(it -> new Tuple2<>(it, a))
|
||||
.collect(Collectors.toList())
|
||||
.iterator())
|
||||
.groupByKey();
|
||||
}
|
||||
|
||||
public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(
|
||||
JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
|
||||
final String of = config.getWf().getOrderField();
|
||||
final int maxQueueSize = config.getWf().getGroupMaxSize();
|
||||
return mapDocs
|
||||
// the reduce is just to be sure that we haven't document with same id
|
||||
.reduceByKey((a, b) -> a)
|
||||
.map(Tuple2::_2)
|
||||
// Clustering: from <id, doc> to List<groupkey,doc>
|
||||
.flatMapToPair(
|
||||
(PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a -> DedupUtility
|
||||
.getGroupingKeys(config, a)
|
||||
.stream()
|
||||
.map(
|
||||
it -> {
|
||||
List<MapDocument> tmp = new ArrayList<>();
|
||||
tmp.add(a);
|
||||
return new Tuple2<>(it, tmp);
|
||||
})
|
||||
.collect(Collectors.toList())
|
||||
.iterator())
|
||||
.reduceByKey(
|
||||
(Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
|
||||
v1.addAll(v2);
|
||||
v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
|
||||
if (v1.size() > maxQueueSize)
|
||||
return new ArrayList<>(v1.subList(0, maxQueueSize));
|
||||
return v1;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the list of vertexes: <id, mapDocument>
|
||||
* @param: the spark context
|
||||
* @param: list of JSON entities
|
||||
* @param: the dedup configuration
|
||||
*/
|
||||
public static JavaPairRDD<String, MapDocument> mapToVertexes(
|
||||
JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
|
||||
|
||||
return entities
|
||||
.mapToPair(
|
||||
(PairFunction<String, String, MapDocument>) s -> {
|
||||
MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
|
||||
return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
|
||||
});
|
||||
}
|
||||
|
||||
public static JavaPairRDD<String, String> computeRelations2(
|
||||
JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
|
||||
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
|
||||
|
||||
return blocks
|
||||
.flatMapToPair(
|
||||
(PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
|
||||
try {
|
||||
final SparkReporter reporter = new SparkReporter(accumulators);
|
||||
new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
|
||||
return reporter.getRelations().iterator();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(it._2().get(0).getIdentifier(), e);
|
||||
}
|
||||
})
|
||||
.mapToPair(
|
||||
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
|
||||
item._1() + item._2(), item))
|
||||
.reduceByKey((a, b) -> a)
|
||||
.mapToPair(
|
||||
(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||
}
|
||||
}
|
@ -1,6 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
/**
 * The entity types known to the dedup jobs. The constant names are lower case because they are
 * resolved with {@code valueOf} from the value of the "entity" job parameter.
 */
public enum OafEntityType {
	datasource,
	organization,
	project,
	dataset,
	otherresearchproduct,
	software,
	publication
}
|
@ -1,112 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.graphx.Edge;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.dedup.graph.ConnectedComponent;
|
||||
import eu.dnetlib.dedup.graph.GraphProcessor;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkCreateConnectedComponent {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateConnectedComponent.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkCreateConnectedComponent.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
|
||||
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
|
||||
final JavaPairRDD<Object, String> vertexes = spark
|
||||
.read()
|
||||
.load(inputPath + "/" + entity)
|
||||
.as(Encoders.kryo(Oaf.class))
|
||||
.map((MapFunction<Oaf, String>) p -> new ObjectMapper().writeValueAsString(p), Encoders.STRING())
|
||||
.javaRDD()
|
||||
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
||||
.mapToPair(
|
||||
(PairFunction<String, Object, String>) s -> new Tuple2<Object, String>(getHashcode(s), s));
|
||||
|
||||
final Dataset<Relation> similarityRelations = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(targetPath, entity))
|
||||
.as(Encoders.bean(Relation.class));
|
||||
final RDD<Edge<String>> edgeRdd = similarityRelations
|
||||
.javaRDD()
|
||||
.map(
|
||||
it -> new Edge<>(
|
||||
getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass()))
|
||||
.rdd();
|
||||
final JavaRDD<ConnectedComponent> cc = GraphProcessor
|
||||
.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
|
||||
.toJavaRDD();
|
||||
final Dataset<Relation> mergeRelation = spark
|
||||
.createDataset(
|
||||
cc
|
||||
.filter(k -> k.getDocIds().size() > 1)
|
||||
.flatMap(
|
||||
(FlatMapFunction<ConnectedComponent, Relation>) c -> c
|
||||
.getDocIds()
|
||||
.stream()
|
||||
.flatMap(
|
||||
id -> {
|
||||
List<Relation> tmp = new ArrayList<>();
|
||||
Relation r = new Relation();
|
||||
r.setSource(c.getCcId());
|
||||
r.setTarget(id);
|
||||
r.setRelClass(ModelConstants.MERGES);
|
||||
tmp.add(r);
|
||||
r = new Relation();
|
||||
r.setTarget(c.getCcId());
|
||||
r.setSource(id);
|
||||
r.setRelClass(ModelConstants.IS_MERGED_IN);
|
||||
tmp.add(r);
|
||||
return tmp.stream();
|
||||
})
|
||||
.iterator())
|
||||
.rdd(),
|
||||
Encoders.bean(Relation.class));
|
||||
mergeRelation
|
||||
.write()
|
||||
.mode("overwrite")
|
||||
.save(DedupUtility.createMergeRelPath(targetPath, entity));
|
||||
}
|
||||
|
||||
public static long getHashcode(final String id) {
|
||||
return Hashing.murmur3_128().hashString(id).asLong();
|
||||
}
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
public class SparkCreateDedupRecord {
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateDedupRecord.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkCreateDedupRecord.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String dedupPath = parser.get("dedupPath");
|
||||
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
|
||||
final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory
|
||||
.createDedupRecord(
|
||||
sc,
|
||||
spark,
|
||||
DedupUtility.createMergeRelPath(dedupPath, entity),
|
||||
DedupUtility.createEntityPath(sourcePath, entity),
|
||||
OafEntityType.valueOf(entity),
|
||||
dedupConf);
|
||||
spark
|
||||
.createDataset(dedupRecord.rdd(), Encoders.kryo(OafEntity.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(dedupPath + "/" + entity + "/dedup_records");
|
||||
//
|
||||
//
|
||||
// dedupRecord
|
||||
// .map(
|
||||
// r -> {
|
||||
// ObjectMapper mapper = new ObjectMapper();
|
||||
// return mapper.writeValueAsString(r);
|
||||
// })
|
||||
// .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records");
|
||||
}
|
||||
}
|
@ -1,92 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* This Spark class creates similarity relations between entities, saving result
|
||||
* <p>
|
||||
* param request: sourcePath entityType target Path
|
||||
*/
|
||||
public class SparkCreateSimRels {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateSimRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkCreateSimRels.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String entity = parser.get("entity");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
// final DedupConfig dedupConf =
|
||||
// DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
|
||||
|
||||
JavaPairRDD<String, MapDocument> mapDocument = spark
|
||||
.read()
|
||||
.load(inputPath + "/" + entity)
|
||||
.as(Encoders.kryo(Oaf.class))
|
||||
.map((MapFunction<Oaf, String>) p -> new ObjectMapper().writeValueAsString(p), Encoders.STRING())
|
||||
.javaRDD()
|
||||
.repartition(1000)
|
||||
.mapToPair(
|
||||
s -> {
|
||||
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
||||
return new Tuple2<>(d.getIdentifier(), d);
|
||||
});
|
||||
|
||||
// create blocks for deduplication
|
||||
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
|
||||
// JavaPairRDD<String, Iterable<MapDocument>> blocks = Deduper.createBlocks(sc,
|
||||
// mapDocument, dedupConf);
|
||||
|
||||
// create relations by comparing only elements in the same group
|
||||
final JavaPairRDD<String, String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
|
||||
// final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations(sc, blocks,
|
||||
// dedupConf);
|
||||
|
||||
final JavaRDD<Relation> isSimilarToRDD = dedupRels
|
||||
.map(
|
||||
simRel -> {
|
||||
final Relation r = new Relation();
|
||||
r.setSource(simRel._1());
|
||||
r.setTarget(simRel._2());
|
||||
r.setRelClass("isSimilarTo");
|
||||
return r;
|
||||
});
|
||||
|
||||
spark
|
||||
.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.mode("overwrite")
|
||||
.save(DedupUtility.createSimRelPath(targetPath, entity));
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import eu.dnetlib.pace.util.Reporter;
|
||||
import scala.Serializable;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkReporter implements Serializable, Reporter {
|
||||
|
||||
final List<Tuple2<String, String>> relations = new ArrayList<>();
|
||||
private static final Log log = LogFactory.getLog(SparkReporter.class);
|
||||
Map<String, LongAccumulator> accumulators;
|
||||
|
||||
public SparkReporter(Map<String, LongAccumulator> accumulators) {
|
||||
this.accumulators = accumulators;
|
||||
}
|
||||
|
||||
public void incrementCounter(
|
||||
String counterGroup,
|
||||
String counterName,
|
||||
long delta,
|
||||
Map<String, LongAccumulator> accumulators) {
|
||||
|
||||
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
|
||||
if (accumulators.containsKey(accumulatorName)) {
|
||||
accumulators.get(accumulatorName).add(delta);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incrementCounter(String counterGroup, String counterName, long delta) {
|
||||
|
||||
incrementCounter(counterGroup, counterName, delta, accumulators);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void emit(String type, String from, String to) {
|
||||
relations.add(new Tuple2<>(from, to));
|
||||
}
|
||||
|
||||
public List<Tuple2<String, String>> getRelations() {
|
||||
return relations;
|
||||
}
|
||||
}
|
@ -1,84 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup.graph;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dedup.DedupUtility;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public class ConnectedComponent implements Serializable {
|
||||
|
||||
private Set<String> docIds;
|
||||
private String ccId;
|
||||
|
||||
public ConnectedComponent() {
|
||||
}
|
||||
|
||||
public ConnectedComponent(Set<String> docIds) {
|
||||
this.docIds = docIds;
|
||||
createID();
|
||||
}
|
||||
|
||||
public String createID() {
|
||||
if (docIds.size() > 1) {
|
||||
final String s = getMin();
|
||||
String prefix = s.split("\\|")[0];
|
||||
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s);
|
||||
return ccId;
|
||||
} else {
|
||||
return docIds.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public String getMin() {
|
||||
|
||||
final StringBuilder min = new StringBuilder();
|
||||
docIds
|
||||
.forEach(
|
||||
i -> {
|
||||
if (StringUtils.isBlank(min.toString())) {
|
||||
min.append(i);
|
||||
} else {
|
||||
if (min.toString().compareTo(i) > 0) {
|
||||
min.setLength(0);
|
||||
min.append(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
return min.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
try {
|
||||
return mapper.writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Failed to create Json: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<String> getDocIds() {
|
||||
return docIds;
|
||||
}
|
||||
|
||||
public void setDocIds(Set<String> docIds) {
|
||||
this.docIds = docIds;
|
||||
}
|
||||
|
||||
public String getCcId() {
|
||||
return ccId;
|
||||
}
|
||||
|
||||
public void setCcId(String ccId) {
|
||||
this.ccId = ccId;
|
||||
}
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
package eu.dnetlib.dedup.graph
|
||||
|
||||
import org.apache.spark.graphx._
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
import scala.collection.JavaConversions;
|
||||
|
||||
object GraphProcessor {

  /** Computes the connected components of the graph made of the given vertexes and edges.
    *
    * Every vertex is labelled with the id of the component returned by GraphX, falling back to its
    * own id when the join with the component ids yields nothing; the vertexes are then grouped by
    * that label.
    *
    * @param vertexes      pairs of (vertex id, document id)
    * @param edges         the edges linking the vertex ids
    * @param maxIterations the maximum number of iterations of the connected components algorithm
    * @return one ConnectedComponent for each group of document ids
    */
  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
    val componentByVertex = graph.connectedComponents(maxIterations).vertices

    vertexes
      .leftOuterJoin(componentByVertex)
      .map { case (vertexId, (openaireId, componentId)) => (componentId.getOrElse(vertexId), openaireId) }
      .groupByKey()
      .map[ConnectedComponent](group => asConnectedComponent(group))
  }

  /** Wraps the distinct document ids of a group into a ConnectedComponent.
    *
    * @param group the pair (component id, document ids labelled with that component)
    * @return the component built from the set of document ids
    */
  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
    val distinctDocIds = group._2.toSet[String]
    new ConnectedComponent(JavaConversions.setAsJavaSet[String](distinctDocIds))
  }

}
|
@ -1,78 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup.sx;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkPropagateRelationsJob {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkPropagateRelationsJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkUpdateEntityJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final String relationPath = parser.get("relationPath");
|
||||
final String mergeRelPath = parser.get("mergeRelPath");
|
||||
final String targetRelPath = parser.get("targetRelPath");
|
||||
|
||||
final Dataset<Relation> merge = spark
|
||||
.read()
|
||||
.load(mergeRelPath)
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.where("relClass == 'merges'");
|
||||
|
||||
final Dataset<Relation> rels = spark
|
||||
.read()
|
||||
.load(relationPath)
|
||||
.as(Encoders.kryo(Relation.class))
|
||||
.map(
|
||||
(MapFunction<Relation, Relation>) r -> r,
|
||||
Encoders.bean(Relation.class));
|
||||
|
||||
final Dataset<Relation> firstJoin = rels
|
||||
.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer")
|
||||
.map(
|
||||
(MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
|
||||
final Relation mergeRelation = r._2();
|
||||
final Relation relation = r._1();
|
||||
if (mergeRelation != null)
|
||||
relation.setSource(mergeRelation.getSource());
|
||||
if (relation.getDataInfo() == null)
|
||||
relation.setDataInfo(OafUtils.generateDataInfo("0.9", false));
|
||||
return relation;
|
||||
},
|
||||
Encoders.bean(Relation.class));
|
||||
|
||||
final Dataset<Relation> secondJoin = firstJoin
|
||||
.joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer")
|
||||
.map(
|
||||
(MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
|
||||
final Relation mergeRelation = r._2();
|
||||
final Relation relation = r._1();
|
||||
if (mergeRelation != null)
|
||||
relation.setTarget(mergeRelation.getSource());
|
||||
return relation;
|
||||
},
|
||||
Encoders.kryo(Relation.class));
|
||||
|
||||
secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath);
|
||||
}
|
||||
}
|
@ -1,102 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dedup.sx;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkUpdateEntityJob {
|
||||
|
||||
static final String IDJSONPATH = "$.id";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkUpdateEntityJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkUpdateEntityJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String entityPath = parser.get("entityPath");
|
||||
final String mergeRelPath = parser.get("mergeRelPath");
|
||||
final String dedupRecordPath = parser.get("dedupRecordPath");
|
||||
final String entity = parser.get("entity");
|
||||
final String destination = parser.get("targetPath");
|
||||
|
||||
final Dataset<Relation> df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
|
||||
final JavaPairRDD<String, String> mergedIds = df
|
||||
.where("relClass == 'merges'")
|
||||
.select(df.col("target"))
|
||||
.distinct()
|
||||
.toJavaRDD()
|
||||
.mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
|
||||
final JavaRDD<String> sourceEntity = sc.textFile(entityPath);
|
||||
|
||||
final JavaRDD<String> dedupEntity = sc.textFile(dedupRecordPath);
|
||||
JavaPairRDD<String, String> entitiesWithId = sourceEntity
|
||||
.mapToPair(
|
||||
(PairFunction<String, String, String>) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s));
|
||||
Class<? extends Oaf> mainClass;
|
||||
switch (entity) {
|
||||
case "publication":
|
||||
mainClass = DLIPublication.class;
|
||||
break;
|
||||
case "dataset":
|
||||
mainClass = DLIDataset.class;
|
||||
break;
|
||||
case "unknown":
|
||||
mainClass = DLIUnknown.class;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Illegal type " + entity);
|
||||
}
|
||||
JavaRDD<String> map = entitiesWithId
|
||||
.leftOuterJoin(mergedIds)
|
||||
.map(
|
||||
k -> k._2()._2().isPresent()
|
||||
? updateDeletedByInference(k._2()._1(), mainClass)
|
||||
: k._2()._1());
|
||||
map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> String updateDeletedByInference(
|
||||
final String json, final Class<T> clazz) {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
try {
|
||||
Oaf entity = mapper.readValue(json, clazz);
|
||||
if (entity.getDataInfo() == null)
|
||||
entity.setDataInfo(new DataInfo());
|
||||
entity.getDataInfo().setDeletedbyinference(true);
|
||||
return mapper.writeValueAsString(entity);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to convert json", e);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,75 +0,0 @@
|
||||
package eu.dnetlib.dedup.sx
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown, OafUtils}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.apache.spark.sql.functions.col
|
||||
|
||||
/** Spark job that flags as "deleted by inference" the entities that are the target of a "merges"
  * relation and stores them together with the dedup records.
  */
object SparkUpdateEntityWithDedupInfo {

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityWithDedupInfo.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")))
    val logger = LoggerFactory.getLogger(SparkUpdateEntityWithDedupInfo.getClass)
    parser.parseArgument(args)

    // NOTE(review): workingPath is only logged; confirm that the parameter file declares it
    val workingPath: String = parser.get("workingPath")
    logger.info(s"Working dir path = $workingPath")

    implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity]
    implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])

    implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
    implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]

    val spark: SparkSession = SparkSession
      .builder()
      .appName(SparkUpdateEntityWithDedupInfo.getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

    val entityPath = parser.get("entityPath")
    val mergeRelPath = parser.get("mergeRelPath")
    val dedupRecordPath = parser.get("dedupRecordPath")
    val entity = parser.get("entity")
    val destination = parser.get("targetPath")

    // distinct ids, as done by SparkUpdateEntityJob: an id occurring in several merge relations
    // would otherwise make the left join below emit the same entity more than once
    val mergedIds = spark.read.load(mergeRelPath).as[Relation]
      .where("relClass == 'merges'")
      .select(col("target"))
      .distinct()

    val entities: Dataset[(String, OafEntity)] = spark
      .read
      .load(entityPath).as[OafEntity]
      .map(o => (o.getId, o))(Encoders.tuple(Encoders.STRING, oafEncoder))

    // the entities matching a merged id are flagged, the others are kept as they are
    val finalDataset: Dataset[OafEntity] = entities.joinWith(mergedIds, entities("_1").equalTo(mergedIds("target")), "left")
      .map(k => {
        val e: OafEntity = k._1._2
        val t = k._2
        if (t != null && t.getString(0).nonEmpty) {
          if (e.getDataInfo == null) {
            e.setDataInfo(OafUtils.generateDataInfo())
          }
          e.getDataInfo.setDeletedbyinference(true)
        }
        e
      })

    val dedupRecords: Dataset[OafEntity] = spark.read.load(dedupRecordPath).as[OafEntity]

    finalDataset.union(dedupRecords)
      .repartition(1200).write
      .mode(SaveMode.Overwrite).save(destination)
  }

}
|
@ -1,33 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the path of the sequential file to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "e",
|
||||
"paramLongName": "entity",
|
||||
"paramDescription": "the type of entity to be deduped",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "c",
|
||||
"paramLongName": "dedupConf",
|
||||
"paramDescription": "dedup configuration to be used",
|
||||
"compressed": true,
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "dedupPath",
|
||||
"paramDescription": "dedup path to load mergeRelation",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,38 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ep",
|
||||
"paramLongName": "entityPath",
|
||||
"paramDescription": "the input entity path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mr",
|
||||
"paramLongName": "mergeRelPath",
|
||||
"paramDescription": "the input path of merge Rel",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dr",
|
||||
"paramLongName": "dedupRecordPath",
|
||||
"paramDescription": "the inputPath of dedup record",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "e",
|
||||
"paramLongName": "entity",
|
||||
"paramDescription": "the type of entity",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the targetPath",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,33 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the path of the sequential file to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "e",
|
||||
"paramLongName": "entity",
|
||||
"paramDescription": "the type of entity to be deduped",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "c",
|
||||
"paramLongName": "dedupConf",
|
||||
"paramDescription": "dedup configuration to be used",
|
||||
"compressed": true,
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "target path to save dedup result",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,26 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ep",
|
||||
"paramLongName": "relationPath",
|
||||
"paramDescription": "the input relation path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mr",
|
||||
"paramLongName": "mergeRelPath",
|
||||
"paramDescription": "the input path of merge Rel",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetRelPath",
|
||||
"paramDescription": "the output Rel Path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,18 +0,0 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
@ -1,182 +0,0 @@
|
||||
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
    <property>
        <name>sourcePath</name>
        <description>the source path</description>
    </property>
    <property>
        <name>entity</name>
        <description>the entity that should be processed</description>
    </property>
    <property>
        <name>dedupConf</name>
        <description>the dedup Configuration</description>
    </property>
    <property>
        <name>targetPath</name>
        <description>the target path</description>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <description>memory for driver process</description>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <description>memory for individual executor</description>
    </property>
    <!-- The two properties below are referenced in the spark-opts of every spark action;
         declaring them makes a missing value fail at submission instead of at action start. -->
    <property>
        <name>sparkExecutorCores</name>
        <description>number of cores used by single executor</description>
    </property>
    <property>
        <name>sparkExtraOPT</name>
        <description>extra options appended to the spark-opts of every action</description>
    </property>
</parameters>
|
||||
<start to="DeleteWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="DeleteWorkingPath">
|
||||
<fs>
|
||||
<delete path='${targetPath}/${entity}'/>
|
||||
<mkdir path="${targetPath}"/>
|
||||
<mkdir path="${targetPath}/${entity}"/>
|
||||
</fs>
|
||||
<ok to="CreateSimRels"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CreateSimRels">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Similarity Relations</name>
|
||||
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
|
||||
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
|
||||
</spark>
|
||||
<ok to="CreateConnectedComponents"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CreateConnectedComponents">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Connected Components</name>
|
||||
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
|
||||
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
|
||||
</spark>
|
||||
<ok to="CreateDedupRecord"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CreateDedupRecord">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Dedup Record</name>
|
||||
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
|
||||
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--dedupPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
|
||||
</spark>
|
||||
<ok to="fixRelation"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="fixRelation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Propagate Dedup Relations</name>
|
||||
<class>eu.dnetlib.dedup.sx.SparkPropagateRelationsJob</class>
|
||||
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
|
||||
<arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
|
||||
<arg>--targetRelPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>
|
||||
</spark>
|
||||
<ok to="updateDeletedByInferenceEntity"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="updateDeletedByInferenceEntity">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Update ${entity} and add DedupRecord</name>
|
||||
<class>eu.dnetlib.dedup.sx.SparkUpdateEntityWithDedupInfo</class>
|
||||
<jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn-cluster</arg>
|
||||
<arg>--entityPath</arg><arg>${sourcePath}/${entity}</arg>
|
||||
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
<arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_record</arg>
|
||||
</spark>
|
||||
<ok to="replaceEntity"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="replaceEntity">
|
||||
<fs>
|
||||
<delete path='${sourcePath}/${entity}'/>
|
||||
<delete path='${sourcePath}/relation'/>
|
||||
<move source="${targetPath}/${entity}/updated_relation" target="${sourcePath}/relation" />
|
||||
<move source="${targetPath}/${entity}/updated_record" target="${sourcePath}/${entity}" />
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,79 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-graph-provision-scholexplorer</artifactId>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.elasticsearch</groupId>
|
||||
<artifactId>elasticsearch-hadoop</artifactId>
|
||||
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
@ -1,425 +0,0 @@
|
||||
package eu.dnetlib.dhp.export
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
|
||||
import java.time.LocalDateTime
|
||||
import java.time.format.DateTimeFormatter
|
||||
import eu.dnetlib.dhp.common.PacePerson
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
/**
  * Flat description of an external reference extracted from a DLI dataset.
  *
  * @param id       identifier of the dataset the reference was built from
  * @param url      URL of the referenced record
  * @param sitename name of the site hosting the record (empty string for plain URLs)
  * @param label    label of the reference (the dataset title, may be null)
  * @param pid      value of the persistent identifier used to build the reference
  * @param classId  class id of the reference qualifier, e.g. "accessionNumber" or "url"
  */
case class DLIExternalReference(id: String, url: String, sitename: String, label: String, pid: String, classId: String) {}
|
||||
|
||||
object DLIToOAF {
|
||||
|
||||
|
||||
// Lookup from a DLI datasource key to the KeyValue built by generateKeyValue
// from the datasource identifier and name listed next to it.
val collectedFromMap: Map[String, KeyValue] = Seq(
  ("dli_________::r3d100010527", "10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive"),
  ("dli_________::r3d100010255", "10|re3data_____::480d275ed6f9666ee76d6a1215eabf26", "Inter-university Consortium for Political and Social Research"),
  ("dli_________::r3d100011868", "10|re3data_____::db814dc656a911b556dba42a331cebe9", "Mendeley Data"),
  ("dli_________::elsevier", "10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier"),
  ("dli_________::openaire", "10|infrastruct_::f66f1bd369679b5b077dcdf006089556", "OpenAIRE"),
  ("dli_________::thomsonreuters", "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
  ("dli_________::r3d100010216", "10|re3data_____::0fd79429de04343dbbec705d9b5f429f", "4TU.Centre for Research Data"),
  ("dli_________::r3d100010134", "10|re3data_____::9633d1e8c4309c833c2c442abeb0cfeb", "PANGAEA"),
  ("dli_________::ieee", "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
  ("dli_________::r3d100010197", "10|re3data_____::9fd1d79973f7fda60cbe1d82e3819a68", "The Cambridge Structural Database"),
  ("dli_________::nature", "10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature"),
  ("dli_________::datacite", "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite"),
  ("dli_________::r3d100010578", "10|re3data_____::c4d751f29a7568011a4c80136b30b444", "IEDA"),
  ("dli_________::r3d100010464", "10|re3data_____::23e2a81591099828f6b83a1c83150666", "Research Data Australia"),
  ("dli_________::r3d100010327", "10|re3data_____::a644620b81135243dc9acc15d2362246", "Worldwide Protein Data Bank"),
  ("dli_________::pubmed", "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "PubMed Central"),
  ("dli_________::europe_pmc__", "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "Europe PubMed Central"),
  ("dli_________::crossref", "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref")
).map { case (dliKey, datasourceId, datasourceName) =>
  dliKey -> generateKeyValue(datasourceId, datasourceName)
}.toMap
|
||||
|
||||
|
||||
// Maps a DLI relation type to the pair (relClass, subRelType) written by convertDLIRelation.
val relationTypeMapping: Map[String, (String, String)] = {
  // Most DLI relation types collapse onto the generic "is related to" relation.
  val genericRelation = (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP)
  val genericTypes = Seq(
    "IsReferencedBy",
    "References",
    "IsRelatedTo",
    "Documents",
    "Unknown",
    "IsSourceOf",
    "Describes",
    "HasAssociationWith"
  )

  genericTypes.map(_ -> genericRelation).toMap ++ Map(
    "IsSupplementedBy" -> (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),
    "Cites" -> (ModelConstants.CITES, ModelConstants.CITATION),
    "IsCitedBy" -> (ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
    "Reviews" -> (ModelConstants.REVIEWS, ModelConstants.REVIEW)
  )
}
|
||||
|
||||
// Qualifier classnames of the pids that filterPid accepts.
val expectecdPidType = List("uniprot", "ena", "chembl", "ncbi-n", "ncbi-p", "genbank", "pdb", "url")
|
||||
|
||||
|
||||
// Host names a "url" pid must contain to be accepted by filterPid.
val filteredURL =
  "www.ebi.ac.uk" ::
    "www.uniprot.org" ::
    "f1000.com" ::
    "en.wikipedia.org" ::
    "flybase.org" ::
    "www.yeastgenome.org" ::
    "research.bioinformatics.udel.edu" ::
    "cancer.sanger.ac.uk" ::
    "www.iedb.org" ::
    "www.crd.york.ac.uk" ::
    "www.wormbase.org" ::
    "web.expasy.org" ::
    "www.hal.inserm.fr" ::
    "sabiork.h-its.org" ::
    "zfin.org" ::
    "www.pombase.org" ::
    "www.guidetopharmacology.org" ::
    "reactome.org" ::
    Nil
|
||||
|
||||
|
||||
// Relation class -> relation class of the inverse relation.
val rel_inverse: Map[String, String] = Seq(
  (ModelConstants.IS_RELATED_TO, ModelConstants.IS_RELATED_TO),
  (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.IS_SUPPLEMENT_TO),
  (ModelConstants.CITES, ModelConstants.IS_CITED_BY),
  (ModelConstants.IS_CITED_BY, ModelConstants.CITES),
  (ModelConstants.REVIEWS, ModelConstants.IS_REVIEWED_BY)
).toMap
|
||||
|
||||
|
||||
// Normalises the pid class ids found in DLI publications; pids whose class id is
// not a key of this map are dropped by convertDLIPublicationToOAF.
val PidTypeMap: Map[String, String] = {
  val pubmedAliases = Seq("pbmid", "pmid", "pubmedid").map(alias => alias -> "pmid")
  val pmcAliases = Seq("pmcid" -> "pmc")
  val doiAliases = Seq("DOI", "doi").map(alias => alias -> "doi")
  (pubmedAliases ++ pmcAliases ++ doiAliases).toMap
}
|
||||
|
||||
|
||||
/**
  * Copies the first collectedFrom of the publication onto each of its instances.
  *
  * @param r the publication to patch; it is modified in place
  * @return the same publication
  */
def fixInstance(r: Publication): Publication = {
  val firstSource = r.getCollectedfrom.asScala.head
  for (instance <- r.getInstance().asScala)
    instance.setCollectedfrom(firstSource)
  r
}
|
||||
|
||||
|
||||
/**
  * Copies the first collectedFrom of the dataset onto each of its instances.
  *
  * @param r the dataset to patch; it is modified in place
  * @return the same dataset
  */
def fixInstanceDataset(r: Dataset): Dataset = {
  val firstSource = r.getCollectedfrom.asScala.head
  for (instance <- r.getInstance().asScala)
    instance.setCollectedfrom(firstSource)
  r
}
|
||||
|
||||
|
||||
// An ObjectMapper is expensive to create and can be shared once configured,
// so one instance serves all calls instead of building one per record.
private lazy val actionSetMapper = new ObjectMapper()

/** Wraps the payload into an AtomicAction of the given class and pairs its JSON with the payload's runtime class name. */
private def serializeAction[T <: Oaf](clazz: Class[T], payload: T): (String, String) = {
  val action = new AtomicAction[T]
  action.setClazz(clazz)
  action.setPayload(payload)
  (payload.getClass.getCanonicalName, actionSetMapper.writeValueAsString(action))
}

/**
  * Serializes an Oaf entity as an AtomicAction.
  *
  * @param item a Dataset, Publication or Relation
  * @return the pair (canonical name of the runtime class of item, JSON of the AtomicAction),
  *         or null when item is of any other type
  */
def toActionSet(item: Oaf): (String, String) = item match {
  case dataset: Dataset => serializeAction(classOf[Dataset], dataset)
  case publication: Publication => serializeAction(classOf[Publication], publication)
  case relation: Relation => serializeAction(classOf[Relation], relation)
  case _ => null
}
|
||||
|
||||
/**
  * Links a dataset to the clinical trial it identifies.
  *
  * @return the pair (generated id of the dataset, id derived from the first
  *         "clinicaltrials.gov" pid), or null when the dataset has no such pid
  */
def convertClinicalTrial(dataset: DLIDataset): (String, String) = {
  val sourceId = generateId(dataset.getId)
  val trialIds = dataset.getPid.asScala.collect {
    case p if "clinicaltrials.gov".equalsIgnoreCase(p.getQualifier.getClassname) =>
      s"50|r3111dacbab5::${DHPUtils.md5(p.getValue.toLowerCase())}"
  }
  trialIds.headOption match {
    case Some(trialId) => (sourceId, trialId)
    case None => null
  }
}
|
||||
|
||||
|
||||
/**
  * Replaces the external references of a publication with the given ones.
  *
  * @param publication        the publication to update; it is modified in place
  * @param externalReferences the references to attach
  * @return the same publication
  */
def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = {

  // Builds the Oaf counterpart of a single DLI external reference.
  def toOafReference(ref: DLIExternalReference): ExternalReference = {
    val target = new ExternalReference()
    target.setSitename(ref.sitename)
    target.setLabel(ref.label)
    target.setUrl(ref.url)
    target.setRefidentifier(ref.pid)
    target.setDataInfo(generateDataInfo())
    target.setQualifier(createQualifier(ref.classId, ModelConstants.DNET_EXTERNAL_REFERENCE_TYPE))
    target
  }

  publication.setExternalReference(externalReferences.map(toOafReference).asJava)
  publication
}
|
||||
|
||||
/**
  * Tells whether a pid can be turned into an external reference.
  *
  * A pid is accepted when its qualifier classname is listed in expectecdPidType;
  * a "url" pid must additionally contain one of the hosts in filteredURL.
  */
def filterPid(p: StructuredProperty): Boolean = {
  val pidType = p.getQualifier.getClassname
  if (!expectecdPidType.contains(pidType))
    false
  else if (pidType.equalsIgnoreCase("url"))
    filteredURL.exists(host => p.getValue.contains(host))
  else
    true
}
|
||||
|
||||
|
||||
/**
  * Returns the first non-empty title value.
  *
  * Null list entries and entries with a null value are skipped instead of
  * raising a NullPointerException.
  *
  * @param titles the titles of a record, may be null
  * @return the first non-empty value, or null when there is none
  */
def extractTitle(titles: java.util.List[StructuredProperty]): String = {
  if (titles == null)
    return null

  titles.asScala
    .filter(t => t != null)
    .map(t => t.getValue)
    .find(v => v != null && v.nonEmpty)
    .orNull
}
|
||||
|
||||
/**
  * Builds an external reference from the first pid of the dataset accepted by filterPid.
  *
  * @param dataset the DLI dataset to convert
  * @return the external reference, or null when the dataset has no usable pid
  *         (including an "ena" pid shorter than 8 characters)
  */
def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = {
  if (dataset.getPid == null)
    return null

  val pid: StructuredProperty = dataset.getPid.asScala.find(filterPid).orNull
  if (pid == null)
    return null

  val value = pid.getValue

  // Every reference shares id, label and pid value; only url, site and class id vary.
  def reference(url: String, sitename: String, classId: String = "accessionNumber"): DLIExternalReference =
    DLIExternalReference(generateId(dataset.getId), url, sitename, extractTitle(dataset.getTitle), value, classId)

  pid.getQualifier.getClassname match {
    case "uniprot" => reference(s"https://www.uniprot.org/uniprot/$value", "UniProt")
    case "ena" =>
      // Only the first 8 characters of the accession are used in the ENA URL.
      if (value != null && value.length > 7)
        reference(s"https://www.ebi.ac.uk/ena/data/view/${value.substring(0, 8)}", "European Nucleotide Archive")
      else
        null
    case "chembl" => reference(s"https://www.ebi.ac.uk/chembl/compound_report_card/$value", "ChEMBL")
    case "ncbi-n" => reference(s"https://www.ncbi.nlm.nih.gov/nuccore/$value", "Nucleotide Database")
    // NOTE(review): "ncbi-p" presumably denotes protein accessions, yet it is kept on the
    // nucleotide URL and site name as in the original code - confirm the intended target.
    case "ncbi-p" => reference(s"https://www.ncbi.nlm.nih.gov/nuccore/$value", "Nucleotide Database")
    case "genbank" => reference(s"https://www.ncbi.nlm.nih.gov/nuccore/$value", "GenBank")
    // PDB entries are served by RCSB (same URL scheme as convertDLIDatasetTOOAF), not by NCBI nuccore.
    case "pdb" => reference(s"https://www.rcsb.org/structure/$value", "Protein Data Bank")
    case "url" => reference(value, "", "url")
    // Unreachable while filterPid and this match agree; avoids a MatchError if they drift apart.
    case _ => null
  }
}
|
||||
|
||||
|
||||
/**
  * Converts a DLI publication into an Oaf Publication.
  *
  * The record is discarded (null is returned) when it lacks any of: a pid whose
  * class id is a key of PidTypeMap, a collectedFrom known to collectedFromMap,
  * an author, a title, a relevant date, or a DOI.
  *
  * Side effects: pids, authors, subjects, the first title and the relevant dates of
  * the input are patched in place and shared with the returned publication.
  *
  * @param inputPublication the DLI publication to convert
  * @return the converted publication, or null when the record is not exportable
  */
def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = {
  // Structural checks come first so that a rejected record is not mutated.
  val sources = inputPublication.getCollectedfrom
  if (sources == null || inputPublication.getPid == null)
    return null
  if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty)
    return null
  if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty)
    return null
  if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.isEmpty)
    return null

  // Null entries are skipped wherever they appear, not only when they are the single element.
  val knownSources = sources.asScala.filter(c => c != null).flatMap(c => collectedFromMap.get(c.getKey))
  if (knownSources.isEmpty)
    return null

  val cleanedPids = inputPublication.getPid.asScala
    .filter(p => PidTypeMap.contains(p.getQualifier.getClassid))
    .map(p => {
      p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid))
      p
    })

  // The instance URL is built from a DOI, so a publication without one is not exportable.
  val dois = cleanedPids.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue)
  if (dois.isEmpty)
    return null

  val result = new Publication
  result.setId(generateId(inputPublication.getId))
  result.setDataInfo(generateDataInfo(invisible = true))
  result.setCollectedfrom(knownSources.asJava)
  result.setPid(cleanedPids.asJava)
  result.setDateofcollection(inputPublication.getDateofcollection)
  result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava)
  // The pattern ends with a literal 'Z', so the clock must be read in UTC.
  result.setDateoftransformation(LocalDateTime.now(java.time.ZoneOffset.UTC).format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
  result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava)
  result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES))

  if (inputPublication.getSubject != null)
    result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava)

  result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava)
  result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava)
  result.setDescription(inputPublication.getDescription)
  result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue))
  result.setPublisher(inputPublication.getPublisher)
  result.setSource(inputPublication.getSource)
  result.setBestaccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))

  // createInstance always returns a fresh Instance, so no null check is needed.
  val instance: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance)
  result.setInstance(List(instance).asJava)

  result
}
|
||||
|
||||
|
||||
/**
  * Rewrites a DLI relation into an OpenAIRE result-result relation.
  *
  * The relation is modified in place: its type becomes RESULT_RESULT, class and
  * sub-type come from relationTypeMapping, and both endpoints get generated ids.
  *
  * @param r the relation to convert
  * @return the same relation, or null when its type is not in relationTypeMapping
  */
def convertDLIRelation(r: Relation): Relation = {
  relationTypeMapping.get(r.getRelType) match {
    case None => null
    case Some((relClass, subRelType)) =>
      r.setRelType(ModelConstants.RESULT_RESULT)
      r.setRelClass(relClass)
      r.setSubRelType(subRelType)
      r.setSource(generateId(r.getSource))
      r.setTarget(generateId(r.getTarget))
      r
  }
}
|
||||
|
||||
|
||||
/**
  * Converts a DLI dataset into an Oaf Dataset.
  *
  * The record is discarded (null is returned) when it lacks any of: a collectedFrom
  * known to collectedFromMap, a "doi" or "pdb" pid with a value, an author, a title,
  * or a relevant date.
  *
  * Side effects: authors, subjects, the first title and the relevant dates of the
  * input are patched in place and shared with the returned dataset.
  *
  * @param d the DLI dataset to convert
  * @return the converted dataset, or null when the record is not exportable
  */
def convertDLIDatasetTOOAF(d: DLIDataset): Dataset = {
  // Structural checks come first so that a rejected record is not mutated.
  val sources = d.getCollectedfrom
  if (sources == null || d.getPid == null)
    return null
  if (d.getAuthor == null || d.getAuthor.isEmpty)
    return null
  if (d.getTitle == null || d.getTitle.isEmpty)
    return null
  if (d.getRelevantdate == null || d.getRelevantdate.isEmpty)
    return null

  // Null entries are skipped wherever they appear, not only when they are the single element.
  val knownSources = sources.asScala.filter(c => c != null).flatMap(c => collectedFromMap.get(c.getKey))
  if (knownSources.isEmpty)
    return null

  // Values of the pids the instance URL can be built from.
  val fpids = d.getPid.asScala
    .filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname) || "pdb".equalsIgnoreCase(p.getQualifier.getClassname))
    .map(p => p.getValue)
    .filter(v => v != null)
  if (fpids.isEmpty)
    return null

  val result: Dataset = new Dataset
  result.setId(generateId(d.getId))
  result.setDataInfo(generateDataInfo())
  result.setCollectedfrom(knownSources.asJava)
  result.setPid(d.getPid)
  result.setDateofcollection(d.getDateofcollection)
  result.setOriginalId(d.getPid.asScala.map(p => p.getValue).asJava)
  // The pattern ends with a literal 'Z', so the clock must be read in UTC.
  result.setDateoftransformation(LocalDateTime.now(java.time.ZoneOffset.UTC).format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
  result.setAuthor(d.getAuthor.asScala.map(convertAuthor).asJava)
  result.setResulttype(createQualifier(d.getResulttype.getClassid, d.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES))

  if (d.getSubject != null)
    result.setSubject(d.getSubject.asScala.map(convertSubject).asJava)

  result.setTitle(List(patchTitle(d.getTitle.get(0))).asJava)
  result.setRelevantdate(d.getRelevantdate.asScala.map(patchRelevantDate).asJava)
  result.setDescription(d.getDescription)
  result.setDateofacceptance(asField(d.getRelevantdate.get(0).getValue))
  result.setPublisher(d.getPublisher)
  result.setSource(d.getSource)
  result.setBestaccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))

  // Identifiers shorter than 5 characters are treated as PDB ids, anything else as a DOI.
  val instance_urls = if (fpids.head.length < 5) s"https://www.rcsb.org/structure/${fpids.head}" else s"https://dx.doi.org/${fpids.head}"

  // createInstance always returns a fresh Instance, so it can be used without a null check.
  val instance: Instance = createInstance(instance_urls, firstInstanceOrNull(d.getInstance()), result.getDateofacceptance, true)

  // Ticket #6281 added pid to Instance
  instance.setPid(result.getPid)
  result.setInstance(List(instance).asJava)

  result
}
|
||||
|
||||
|
||||
/**
  * Returns the first instance of the list.
  *
  * @param instances a list of instances, may be null
  * @return the element at index 0, or null when the list is null or empty
  */
def firstInstanceOrNull(instances: java.util.List[Instance]): Instance = {
  if (instances != null && instances.size() > 0)
    instances.get(0)
  else
    null
}
|
||||
|
||||
|
||||
/**
  * Creates a new Instance pointing at the given URL.
  *
  * @param url              the only URL of the instance
  * @param originalInstance instance whose hostedby is copied when present, may be null
  * @param doa              date of acceptance assigned to the instance
  * @param dataset          when true the instance type is "0021"/"Dataset", otherwise "0000"/"Unknown"
  * @return the new instance, never null
  */
def createInstance(url: String, originalInstance: Instance, doa: Field[String], dataset: Boolean = false): Instance = {
  val (typeId, typeName) = if (dataset) ("0021", "Dataset") else ("0000", "Unknown")

  val instance = new Instance
  instance.setUrl(List(url).asJava)
  instance.setInstancetype(createQualifier(typeId, typeName, ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))

  // hostedby is carried over only when the original instance provides one.
  Option(originalInstance)
    .flatMap(original => Option(original.getHostedby))
    .foreach(hostedBy => instance.setHostedby(hostedBy))

  instance.setAccessright(createAccessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
  instance.setDateofacceptance(doa)
  instance
}
|
||||
|
||||
|
||||
/**
  * Overwrites the qualifier of a relevant date with the UNKNOWN class of the
  * DNET_DATACITE_DATE scheme. The property is modified in place and returned.
  */
def patchRelevantDate(d: StructuredProperty): StructuredProperty = {
  val unknownDateType = createQualifier(ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE)
  d.setQualifier(unknownDateType)
  d
}
|
||||
|
||||
/**
  * Marks a title as main title by assigning MAIN_TITLE_QUALIFIER.
  * The property is modified in place and returned.
  */
def patchTitle(t: StructuredProperty): StructuredProperty = {
  val mainTitle = ModelConstants.MAIN_TITLE_QUALIFIER
  t.setQualifier(mainTitle)
  t
}
|
||||
|
||||
|
||||
/**
  * Overwrites the qualifier of a subject with the "keyword" class of the
  * DNET_SUBJECT_TYPOLOGIES scheme. The property is modified in place and returned.
  */
def convertSubject(s: StructuredProperty): StructuredProperty = {
  val keyword = createQualifier("keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)
  s.setQualifier(keyword)
  s
}
|
||||
|
||||
|
||||
/**
  * Fills name and surname of an author from its full name when PacePerson
  * reports the parsing as accurate.
  *
  * @param a the author to patch, may be null; it is modified in place
  * @return the same author (null when a is null)
  */
def convertAuthor(a: Author): Author = {
  if (a != null) {
    val parsed = new PacePerson(a.getFullname, false)
    if (parsed.isAccurate) {
      a.setName(parsed.getNameString)
      a.setSurname(parsed.getSurnameString)
    }
  }
  a
}
|
||||
|
||||
|
||||
/**
  * Builds the scholix identifier of a DLI identifier.
  *
  * The part of id following the first "::" is kept; when id has no "::" the part
  * following the first "|" is kept (empty when neither separator is present).
  *
  * @param id the DLI identifier
  * @return "50|scholix_____::" followed by the kept part
  */
def generateId(id: String): String = {
  val separator = if (id.contains("::")) "::" else "|"
  val suffix = StringUtils.substringAfter(id, separator)
  "50|scholix_____::" + suffix
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
@ -1,175 +0,0 @@
|
||||
package eu.dnetlib.dhp.`export`
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.expressions.Window
|
||||
import org.apache.spark.SparkConf
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
/**
  * Spark job that converts the Scholexplorer (DLI) content found under `workingDirPath`
  * into OAF publications, datasets and relations and serializes them as a gzip-compressed
  * Hadoop sequence file of (Text, Text) pairs under `workingDirPath/export/rawset`.
  *
  * Every intermediate result is written under `workingDirPath/export/` and read back
  * before the next step, so the statements below must keep their order.
  */
object SparkExportContentForOpenAire {

  /**
    * Job entry point.
    *
    * @param args command line arguments, parsed according to
    *             `input_export_content_parameters.json` (this code reads `master` and `workingDirPath`)
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkExportContentForOpenAire.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val workingPath = parser.get("workingDirPath")

    implicit val dliPubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication])
    implicit val dliDatEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
    implicit val pubEncoder: Encoder[Publication] = Encoders.bean(classOf[Publication])
    implicit val datEncoder: Encoder[OafDataset] = Encoders.bean(classOf[OafDataset])
    implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])

    import spark.implicits._

    // Convert the relations that are not deleted by inference.
    val dsRel = spark.read.load(s"$workingPath/relation_b").as[Relation]
    dsRel.filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
      .map(DLIToOAF.convertDLIRelation)
      .filter(r => r != null)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS")

    // Convert the publications; a missing dataInfo is treated as "not deleted",
    // consistently with the relation filter above.
    val dsPubs = spark.read.load(s"$workingPath/publication").as[DLIPublication]
    dsPubs
      .filter(p => p.getDataInfo == null || p.getDataInfo.getDeletedbyinference == false)
      .map(DLIToOAF.convertDLIPublicationToOAF)
      .filter(p => p != null)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS")

    // Convert the datasets with the same null-safe filter.
    val dsDataset = spark.read.load(s"$workingPath/dataset").as[DLIDataset]
    dsDataset
      .filter(p => p.getDataInfo == null || p.getDataInfo.getDeletedbyinference == false)
      .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p => p != null)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetDS")

    val pubs: Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS").as[Publication]
    val dats: Dataset[OafDataset] = spark.read.load(s"$workingPath/export/datasetDS").as[OafDataset]
    val relDS1: Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS").as[Relation]

    val pub_id = pubs.select("id").distinct()
    val dat_id = dats.select("id").distinct()

    // Keep the relations whose source is an exported publication ...
    pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_f1")

    val relDS2 = spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation]

    // ... and, among those, the ones whose target is an exported dataset.
    // The join condition references the column of the dataset actually being joined (dat_id).
    relDS2.joinWith(dat_id, relDS2("target").equalTo(dat_id("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_filtered")

    val r_source = relDS2.select(relDS2("source")).distinct()
    val r_target = relDS2.select(relDS2("target")).distinct()

    // Used to keep a single row per id (the first one by lastupdatetimestamp).
    val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp")

    pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1)
      .withColumn("row", row_number.over(w2)).where($"row" === 1).drop("row")
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS_filtered")

    dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1)
      .withColumn("row", row_number.over(w2)).where($"row" === 1).drop("row")
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS")

    dsDataset.map(DLIToOAF.convertDLIDatasetToExternalReference).filter(p => p != null).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference")

    val pf = spark.read.load(s"$workingPath/export/publicationDS_filtered").select("id")
    val relDS3 = spark.read.load(s"$workingPath/export/relationDS").as[Relation]
    val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")), "inner").map(t => t._2)

    val extRef = spark.read.load(s"$workingPath/export/externalReference").as[DLIExternalReference]

    // Group the external references by publication id, keeping at most 100 per publication.
    spark.createDataset(relationTo.joinWith(extRef, relationTo("target").equalTo(extRef("id")), "inner").map(d => {
      val r = d._1
      val ext = d._2
      (r.getSource, ext)
    }).rdd.groupByKey.map(f => {
      val dli_ext = ArrayBuffer[DLIExternalReference]()
      dli_ext ++= f._2.take(100)
      (f._1, dli_ext)
    })).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference_grouped")

    val pubf: Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS_filtered").as[Publication]

    val groupedERf: Dataset[(String, List[DLIExternalReference])] = spark.read.load(s"$workingPath/export/externalReference_grouped").as[(String, List[DLIExternalReference])]

    groupedERf.joinWith(pubf, pubf("id").equalTo(groupedERf("_1"))).map(t => {
      val publication = t._2
      if (t._1 != null) {
        val eRefs = t._1._2
        DLIToOAF.insertExternalRefs(publication, eRefs)
      } else
        publication
    }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS")

    dsDataset
      .map(DLIToOAF.convertClinicalTrial)
      .filter(p => p != null)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrials")

    val ct: Dataset[(String, String)] = spark.read.load(s"$workingPath/export/clinicalTrials").as[(String, String)]

    val relDS = spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation]

    // Replace the target of the relations pointing to a clinical trial with the
    // second element of the matching clinicalTrials pair.
    relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner")
      .map(k => {
        val currentRel = k._1
        currentRel.setTarget(k._2._2)
        currentRel
      }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrialsRels")

    val clRels: Dataset[Relation] = spark.read.load(s"$workingPath/export/clinicalTrialsRels").as[Relation]
    val rels: Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS_filtered").as[Relation]

    // Emit every relation together with its inverse.
    rels.union(clRels).flatMap(r => {
      val inverseRel = new Relation
      inverseRel.setSource(r.getTarget)
      inverseRel.setTarget(r.getSource)
      inverseRel.setDataInfo(r.getDataInfo)
      inverseRel.setCollectedfrom(r.getCollectedfrom)
      inverseRel.setRelType(r.getRelType)
      inverseRel.setSubRelType(r.getSubRelType)
      inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass))
      List(r, inverseRel)
    }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationAS")

    spark.read.load(s"$workingPath/export/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS_fixed")
    spark.read.load(s"$workingPath/export/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS_fixed")

    val fRels: Dataset[(String, String)] = spark.read.load(s"$workingPath/export/relationAS").as[Relation].map(DLIToOAF.toActionSet)
    val fpubs: Dataset[(String, String)] = spark.read.load(s"$workingPath/export/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet)
    val fdats: Dataset[(String, String)] = spark.read.load(s"$workingPath/export/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet)

    // Final output: one gzip-compressed sequence file holding relations, publications and datasets.
    fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
  }

}
|
@ -1,112 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.export.zenodo;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.MakeTarArchive;
|
||||
|
||||
public class MakeTar implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
MakeTar.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/export/input_maketar_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String outputPath = parser.get("targetPath");
|
||||
log.info("hdfsPath: {}", outputPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", hdfsNameNode);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("input path : {}", inputPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
MakeTarArchive.tarMaxSize(fileSystem, inputPath, outputPath, "scholix_dump", 25);
|
||||
|
||||
}
|
||||
|
||||
// public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
|
||||
//
|
||||
// RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
|
||||
//
|
||||
// while (dir_iterator.hasNext()) {
|
||||
// LocatedFileStatus fileStatus = dir_iterator.next();
|
||||
//
|
||||
// Path p = fileStatus.getPath();
|
||||
// String p_string = p.toString();
|
||||
// String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
//
|
||||
// write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
|
||||
// }
|
||||
//
|
||||
// }
|
||||
//
|
||||
// private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
|
||||
// throws IOException {
|
||||
//
|
||||
// Path hdfsWritePath = new Path(outputPath);
|
||||
// FSDataOutputStream fsDataOutputStream = null;
|
||||
// if (fileSystem.exists(hdfsWritePath)) {
|
||||
// fileSystem.delete(hdfsWritePath, true);
|
||||
//
|
||||
// }
|
||||
// fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
//
|
||||
// TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
|
||||
//
|
||||
// RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
// .listFiles(
|
||||
// new Path(inputPath), true);
|
||||
//
|
||||
// while (fileStatusListIterator.hasNext()) {
|
||||
// LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||
//
|
||||
// Path p = fileStatus.getPath();
|
||||
// String p_string = p.toString();
|
||||
// if (!p_string.endsWith("_SUCCESS")) {
|
||||
// String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
// TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
|
||||
// entry.setSize(fileStatus.getLen());
|
||||
// ar.putArchiveEntry(entry);
|
||||
//
|
||||
// InputStream is = fileSystem.open(fileStatus.getPath());
|
||||
//
|
||||
// BufferedInputStream bis = new BufferedInputStream(is);
|
||||
//
|
||||
// int count;
|
||||
// byte data[] = new byte[1024];
|
||||
// while ((count = bis.read(data, 0, data.length)) != -1) {
|
||||
// ar.write(data, 0, count);
|
||||
// }
|
||||
// bis.close();
|
||||
// ar.closeArchiveEntry();
|
||||
//
|
||||
// }
|
||||
//
|
||||
// }
|
||||
//
|
||||
// ar.close();
|
||||
// }
|
||||
|
||||
}
|
@ -1,80 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.export.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
|
||||
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
|
||||
|
||||
public class SendToZenodoHDFS implements Serializable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception, MissingConceptDoiException {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SendToZenodoHDFS.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/export/upload_zenodo.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
final String access_token = parser.get("accessToken");
|
||||
final String connection_url = parser.get("connectionUrl");
|
||||
final String metadata = parser.get("metadata");
|
||||
final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition"));
|
||||
final String concept_rec_id = Optional
|
||||
.ofNullable(parser.get("conceptRecordId"))
|
||||
.orElse(null);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
new Path(hdfsPath), true);
|
||||
ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
|
||||
if (newDeposition) {
|
||||
zenodoApiClient.newDeposition();
|
||||
} else {
|
||||
if (concept_rec_id == null) {
|
||||
throw new MissingConceptDoiException("No concept record id has been provided");
|
||||
}
|
||||
zenodoApiClient.newVersion(concept_rec_id);
|
||||
}
|
||||
|
||||
while (fileStatusListIterator.hasNext()) {
|
||||
LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||
|
||||
Path p = fileStatus.getPath();
|
||||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
// String tmp = p_string.substring(0, p_string.lastIndexOf("/"));
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
log.info("Sending information for community: " + name);
|
||||
FSDataInputStream inputStream = fileSystem.open(p);
|
||||
zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
zenodoApiClient.sendMretadata(metadata);
|
||||
// zenodoApiClient.publish();
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,98 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpDelete;
|
||||
import org.apache.http.client.methods.HttpPut;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class DropAndCreateESIndex {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
DropAndCreateESIndex.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/provision/dropAndCreateIndex.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
||||
final String cluster = parser.get("cluster");
|
||||
final String clusterJson = IOUtils
|
||||
.toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/cluster.json"));
|
||||
|
||||
final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);
|
||||
|
||||
final String ip = clusterMap.get(cluster).split(",")[0];
|
||||
|
||||
System.out.println(ip);
|
||||
|
||||
final String url = "http://%s:9200/%s_%s";
|
||||
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
|
||||
HttpDelete delete = new HttpDelete(String.format(url, ip, index, "object"));
|
||||
|
||||
CloseableHttpResponse response = client.execute(delete);
|
||||
|
||||
System.out.println("deleting Index SUMMARY");
|
||||
System.out.println(response.getStatusLine());
|
||||
client.close();
|
||||
client = HttpClients.createDefault();
|
||||
|
||||
delete = new HttpDelete(String.format(url, ip, index, "scholix"));
|
||||
|
||||
response = client.execute(delete);
|
||||
|
||||
System.out.println("deleting Index SCHOLIX");
|
||||
System.out.println(response.getStatusLine());
|
||||
client.close();
|
||||
client = HttpClients.createDefault();
|
||||
|
||||
final String summaryConf = IOUtils
|
||||
.toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/summary_index.json"));
|
||||
|
||||
final String scholixConf = IOUtils
|
||||
.toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/scholix_index.json"));
|
||||
|
||||
HttpPut put = new HttpPut(String.format(url, ip, index, "object"));
|
||||
|
||||
StringEntity entity = new StringEntity(summaryConf);
|
||||
put.setEntity(entity);
|
||||
put.setHeader("Accept", "application/json");
|
||||
put.setHeader("Content-type", "application/json");
|
||||
|
||||
System.out.println("creating First Index SUMMARY");
|
||||
response = client.execute(put);
|
||||
|
||||
client.close();
|
||||
client = HttpClients.createDefault();
|
||||
|
||||
System.out.println(response.getStatusLine());
|
||||
|
||||
System.out.println("creating Index SCHOLIX");
|
||||
put = new HttpPut(String.format(url, ip, index, "scholix"));
|
||||
|
||||
entity = new StringEntity(scholixConf);
|
||||
put.setEntity(entity);
|
||||
put.setHeader("Accept", "application/json");
|
||||
put.setHeader("Content-type", "application/json");
|
||||
|
||||
response = client.execute(put);
|
||||
System.out.println(response.getStatusLine());
|
||||
client.close();
|
||||
|
||||
}
|
||||
}
|
@ -1,48 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.Typology;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class ProvisionUtil {
|
||||
|
||||
public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference";
|
||||
public static final String TARGETJSONPATH = "$.target";
|
||||
public static final String SOURCEJSONPATH = "$.source";
|
||||
|
||||
// public static RelatedItemInfo getItemType(final String item, final String idPath) {
|
||||
// String targetId = DHPUtils.getJPathString(idPath, item);
|
||||
// switch (StringUtils.substringBefore(targetId, "|")) {
|
||||
// case "50":
|
||||
// return new RelatedItemInfo(null,0,1,0);
|
||||
// case "60":
|
||||
// return new RelatedItemInfo(null,1,0,0);
|
||||
// case "70":
|
||||
// return new RelatedItemInfo(null,0,0,1);
|
||||
// default:
|
||||
// throw new RuntimeException("Unknonw target ID");
|
||||
//
|
||||
// }
|
||||
//
|
||||
// }
|
||||
|
||||
public static Boolean isNotDeleted(final String item) {
|
||||
return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item));
|
||||
}
|
||||
|
||||
public static Typology getItemTypeFromId(String id) {
|
||||
|
||||
switch (StringUtils.substringBefore(id, "|")) {
|
||||
case "50":
|
||||
return Typology.publication;
|
||||
case "60":
|
||||
return Typology.dataset;
|
||||
case "70":
|
||||
return Typology.unknown;
|
||||
default:
|
||||
throw new RuntimeException("Unknonw ID type");
|
||||
}
|
||||
}
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Bean holding, for one source item, the number of related datasets, publications
 * and unknown items.
 */
public class RelatedItemInfo implements Serializable {

	private String source;

	private long relatedDataset = 0;

	private long relatedPublication = 0;

	private long relatedUnknown = 0;

	/** Creates an instance with no source and all the counters set to 0. */
	public RelatedItemInfo() {
	}

	/**
	 * @param source             the identifier of the item the counters refer to
	 * @param relatedDataset     number of related datasets
	 * @param relatedPublication number of related publications
	 * @param relatedUnknown     number of related unknown items
	 */
	public RelatedItemInfo(
		String source, long relatedDataset, long relatedPublication, long relatedUnknown) {
		this.source = source;
		this.relatedDataset = relatedDataset;
		this.relatedPublication = relatedPublication;
		this.relatedUnknown = relatedUnknown;
	}

	public String getSource() {
		return source;
	}

	public void setSource(String source) {
		this.source = source;
	}

	public long getRelatedDataset() {
		return relatedDataset;
	}

	public void setRelatedDataset(long relatedDataset) {
		this.relatedDataset = relatedDataset;
	}

	public long getRelatedPublication() {
		return relatedPublication;
	}

	public void setRelatedPublication(long relatedPublication) {
		this.relatedPublication = relatedPublication;
	}

	public long getRelatedUnknown() {
		return relatedUnknown;
	}

	/**
	 * Takes a {@code long} like the field, the getter and the other setters, so that getter and
	 * setter form a consistent bean property; {@code int} arguments are still accepted by widening.
	 */
	public void setRelatedUnknown(long relatedUnknown) {
		this.relatedUnknown = relatedUnknown;
	}
}
|
@ -1,38 +0,0 @@
|
||||
package eu.dnetlib.dhp.provision
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
|
||||
/**
  * Spark job that serializes the `summary` and `scholix` datasets found under `workingPath`
  * as gzip-compressed JSON text files (`summary_json` and `scholix_json`).
  */
object SparkConvertDatasetToJson {

  /**
    * Job entry point.
    *
    * @param args command line arguments, parsed according to `dataset2Json.json`
    *             (this code reads `master` and `workingPath`)
    */
  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

    val workingPath = parser.get("workingPath")

    // One ObjectMapper per partition instead of one per record: building a mapper is
    // expensive and the instance can be reused for every record of the partition.
    spark.read.load(s"$workingPath/summary").as[ScholixSummary]
      .mapPartitions(records => {
        val mapper = new ObjectMapper()
        records.map(s => mapper.writeValueAsString(s))
      })(Encoders.STRING)
      .rdd.repartition(500).saveAsTextFile(s"$workingPath/summary_json", classOf[GzipCodec])

    spark.read.load(s"$workingPath/scholix").as[Scholix]
      .mapPartitions(records => {
        val mapper = new ObjectMapper()
        records.map(s => mapper.writeValueAsString(s))
      })(Encoders.STRING)
      .rdd.repartition(2000).saveAsTextFile(s"$workingPath/scholix_json", classOf[GzipCodec])
  }

}
|
@ -1,60 +0,0 @@
|
||||
package eu.dnetlib.dhp.provision
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.functions.{coalesce, col, count, lit}
|
||||
|
||||
|
||||
/**
  * Spark job that reads the relations and computes, for every relation source, how many of
  * its targets are publications (id starting with 50), datasets (60) or unknown items (70).
  * The result is saved under `workingDirPath/relatedItemCount`.
  */
object SparkExtractRelationCount {

  /**
    * Job entry point.
    *
    * @param args command line arguments, parsed according to `input_related_entities_parameters.json`
    *             (this code reads `master`, `workingDirPath` and `relationPath`)
    */
  def main(args: Array[String]): Unit = {

    val argumentParser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractRelationCount.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_related_entities_parameters.json")))
    argumentParser.parseArgument(args)
    val spark = SparkSession.builder.appName(SparkExtractRelationCount.getClass.getSimpleName).master(argumentParser.get("master")).getOrCreate

    val workingDirPath = argumentParser.get("workingDirPath")
    val relationPath = argumentParser.get("relationPath")

    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]

    // Re-encode the relations as beans so that their fields can be queried as columns.
    val relation = spark.read.load(relationPath).as[Relation].map(r => r)(Encoders.bean(classOf[Relation]))

    // Number of targets matching the condition, grouped by relation source.
    def countBySource(targetCondition: String, sourceAlias: String, countAlias: String) =
      relation
        .where(targetCondition)
        .groupBy("source")
        .agg(count("target").as(countAlias))
        .select(col("source").alias(sourceAlias), col(countAlias))

    val relatedPublication = countBySource("target like '50%'", "p_source", "publication")
    val relatedDataset = countBySource("target like '60%'", "d_source", "dataset")
    val relatedUnknown = countBySource("target like '70%'", "u_source", "unknown")

    // Full outer joins keep the sources having only one kind of related item.
    val publicationsAndDatasets = relatedPublication
      .join(relatedDataset, col("p_source").equalTo(col("d_source")), "full")
      .select(
        coalesce(col("p_source"), col("d_source")).alias("id"),
        col("publication"),
        col("dataset"))

    // Missing counters are replaced with 0.
    val relatedItemCount = publicationsAndDatasets
      .join(relatedUnknown, col("u_source").equalTo(col("id")), "full")
      .select(
        coalesce(col("u_source"), col("id")).alias("source"),
        coalesce(col("publication"), lit(0)).alias("relatedPublication"),
        coalesce(col("dataset"), lit(0)).alias("relatedDataset"),
        coalesce(col("unknown"), lit(0)).alias("relatedUnknown"))

    relatedItemCount.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/relatedItemCount")
  }

}
|
@ -1,94 +0,0 @@
|
||||
package eu.dnetlib.dhp.provision
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixResource}
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
|
||||
/**
  * Spark job that builds the Scholix records by joining the summaries with the relations,
  * first on the relation source and then on the relation target, and finally merges the
  * records sharing the same identifier.
  */
object SparkGenerateScholixIndex {

  /**
    * Aggregator merging all the Scholix records of a group into a single one through
    * `Scholix.mergeFrom`, starting from an empty Scholix.
    */
  def getScholixAggregator(): Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] {

    override def zero: Scholix = new Scholix()

    override def reduce(buffer: Scholix, keyed: (String, Scholix)): Scholix = {
      buffer.mergeFrom(keyed._2)
      buffer
    }

    override def merge(left: Scholix, right: Scholix): Scholix = {
      left.mergeFrom(right)
      left
    }

    override def finish(reduction: Scholix): Scholix = reduction

    override def bufferEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])

    override def outputEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
  }

  /**
    * Job entry point.
    *
    * @param args command line arguments, parsed according to `input_generate_summary_parameters.json`
    *             (this code reads `master`, `graphPath` and `workingDirPath`)
    */
  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholixIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")))
    parser.parseArgument(args)

    val conf = new SparkConf
    conf.set("spark.sql.shuffle.partitions", "4000")
    val spark = SparkSession.builder.config(conf).appName(SparkGenerateScholixIndex.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    val graphPath = parser.get("graphPath")
    val workingDirPath = parser.get("workingDirPath")

    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val tupleScholix: Encoder[(String, Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder)

    // Summaries keyed by their identifier.
    val summaryById: Dataset[(String, ScholixSummary)] = spark.read.load(s"$workingDirPath/summary").as[ScholixSummary]
      .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder))

    // Relations keyed by their source identifier.
    val relationBySource: Dataset[(String, Relation)] = spark.read.load(s"$graphPath/relation").as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))

    // Step 1: attach the source summary to every relation; the result is keyed by relation target.
    summaryById.joinWith(relationBySource, summaryById("_1").equalTo(relationBySource("_1")), "inner")
      .map(joined => {
        val sourceSummary = joined._1._2
        val rel = joined._2._2
        (rel.getTarget, Scholix.generateScholixWithSource(sourceSummary, rel))
      })
      .repartition(6000)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source")

    val scholixByTarget: Dataset[(String, Scholix)] = spark.read.load(s"$workingDirPath/scholix_source").as[(String, Scholix)]

    // Step 2: attach the target summary and generate identifier and link publisher.
    scholixByTarget.joinWith(summaryById, scholixByTarget("_1").equalTo(summaryById("_1")), "inner")
      .map(joined => {
        val targetSummary = joined._2._2
        val partial = joined._1._2
        partial.setTarget(ScholixResource.fromSummary(targetSummary))
        partial.generateIdentifier()
        partial.generatelinkPublisher()
        partial
      })
      .repartition(6000)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_r")

    val completeScholix: Dataset[Scholix] = spark.read.load(s"$workingDirPath/scholix_r").as[Scholix]

    // Step 3: merge the records sharing the same identifier.
    completeScholix.map(d => (d.getIdentifier, d))(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(getScholixAggregator().toColumn)
      .map(merged => merged._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix")
  }

}
|
@ -1,70 +0,0 @@
|
||||
package eu.dnetlib.dhp.provision
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
|
||||
/**
  * Spark job that produces the Scholix summary records.
  *
  * Publications, datasets and unknown entities are loaded from the graph, turned into
  * [[ScholixSummary]] objects, joined with the related-item information stored in the
  * working directory and finally written back to the working directory.
  */
object SparkGenerateSummaryIndex {

  /**
    * Job entry point.
    *
    * Arguments are parsed according to `input_generate_summary_parameters.json`; the
    * parameters read here are `master`, `graphPath` and `workingDirPath`.
    *
    * @param args command line arguments of the job
    */
  def main(args: Array[String]): Unit = {
    val parameterSpec = IOUtils.toString(
      SparkGenerateSummaryIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))
    val parser = new ArgumentApplicationParser(parameterSpec)
    parser.parseArgument(args)

    val spark = SparkSession.builder
      .appName(SparkGenerateSummaryIndex.getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate

    val graphPath = parser.get("graphPath")
    val workingDirPath = parser.get("workingDirPath")

    // Encoders picked up implicitly by the Dataset operations below.
    implicit val relatedItemInfoEncoders: Encoder[RelatedItemInfo] = Encoders.bean(classOf[RelatedItemInfo])
    implicit val datasetEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
    implicit val publicationEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
    implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val oafWithIdEncoder: Encoder[(String, Oaf)] = Encoders.tuple(Encoders.STRING, oafEncoder)
    implicit val scholixSummaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixSummaryEncoderTuple: Encoder[(String, ScholixSummary)] = Encoders.tuple(Encoders.STRING, scholixSummaryEncoder)

    // Each entity type is keyed by its own identifier.
    val pubs = spark.read.load(s"$graphPath/publication").as[Oaf].map(o => (o.asInstanceOf[DLIPublication].getId, o))
    val dats = spark.read.load(s"$graphPath/dataset").as[Oaf].map(o => (o.asInstanceOf[DLIDataset].getId, o))
    val ukn = spark.read.load(s"$graphPath/unknown").as[Oaf].map(o => (o.asInstanceOf[DLIUnknown].getId, o))

    val summaries: Dataset[(String, ScholixSummary)] = pubs.union(dats).union(ukn).map(entry => {
      val converted = ScholixSummary.fromOAF(entry._2)
      (converted.getId, converted)
    })

    val relatedCounts: Dataset[RelatedItemInfo] = spark.read.load(s"$workingDirPath/relatedItemCount").as[RelatedItemInfo]

    // Inner join: only summaries having a matching related-item record are kept.
    val sameIdentifier = summaries("_1").equalTo(relatedCounts("source"))
    val enriched = summaries.joinWith(relatedCounts, sameIdentifier, "inner").map(pair => {
      val current = pair._1._2
      val counters = pair._2
      current.setRelatedDatasets(counters.getRelatedDataset)
      current.setRelatedPublications(counters.getRelatedPublication)
      current.setRelatedUnknown(counters.getRelatedUnknown)
      current
    })

    enriched
      .filter(s => s.getLocalIdentifier != null)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/summary")
  }

}
|
@ -1,61 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class
|
||||
SparkIndexCollectionOnES {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkIndexCollectionOnES.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/provision/index_on_es.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
SparkConf conf = new SparkConf()
|
||||
.setAppName(SparkIndexCollectionOnES.class.getSimpleName())
|
||||
.setMaster(parser.get("master"));
|
||||
|
||||
conf.set("spark.sql.shuffle.partitions", "4000");
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String index = parser.get("index");
|
||||
final String idPath = parser.get("idPath");
|
||||
final String cluster = parser.get("cluster");
|
||||
final String clusterJson = IOUtils
|
||||
.toString(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/provision/cluster.json"));
|
||||
|
||||
final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<String> inputRdd = sc.textFile(sourcePath);
|
||||
|
||||
Map<String, String> esCfg = new HashMap<>();
|
||||
esCfg.put("es.nodes", clusterMap.get(cluster));
|
||||
esCfg.put("es.mapping.id", idPath);
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
}
|
||||
}
|
@ -1,286 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class Scholix implements Serializable {
|
||||
private String publicationDate;
|
||||
|
||||
private List<ScholixEntityId> publisher;
|
||||
|
||||
private List<ScholixEntityId> linkprovider;
|
||||
|
||||
private ScholixRelationship relationship;
|
||||
|
||||
private ScholixResource source;
|
||||
|
||||
private ScholixResource target;
|
||||
|
||||
private String identifier;
|
||||
|
||||
public Scholix clone(final ScholixResource t) {
|
||||
final Scholix clone = new Scholix();
|
||||
clone.setPublicationDate(publicationDate);
|
||||
clone.setPublisher(publisher);
|
||||
clone.setLinkprovider(linkprovider);
|
||||
clone.setRelationship(relationship);
|
||||
clone.setSource(source);
|
||||
clone.setTarget(t);
|
||||
clone.generatelinkPublisher();
|
||||
clone.generateIdentifier();
|
||||
return clone;
|
||||
}
|
||||
|
||||
public static Scholix generateScholixWithSource(
|
||||
final String sourceSummaryJson, final String relation) {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
try {
|
||||
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
|
||||
Relation rel = mapper.readValue(relation, Relation.class);
|
||||
final Scholix s = new Scholix();
|
||||
if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0)
|
||||
s.setPublicationDate(scholixSummary.getDate().get(0));
|
||||
s
|
||||
.setLinkprovider(
|
||||
rel
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(
|
||||
cf -> new ScholixEntityId(
|
||||
cf.getValue(),
|
||||
Collections
|
||||
.singletonList(
|
||||
new ScholixIdentifier(cf.getKey(), "dnet_identifier"))))
|
||||
.collect(Collectors.toList()));
|
||||
s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null));
|
||||
s.setSource(ScholixResource.fromSummary(scholixSummary));
|
||||
return s;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(
|
||||
String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e);
|
||||
}
|
||||
}
|
||||
|
||||
public static Scholix generateScholixWithSource(
|
||||
final ScholixSummary scholixSummary, final Relation rel) {
|
||||
final Scholix s = new Scholix();
|
||||
if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0)
|
||||
s.setPublicationDate(scholixSummary.getDate().get(0));
|
||||
s
|
||||
.setLinkprovider(
|
||||
rel
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(
|
||||
cf -> new ScholixEntityId(
|
||||
cf.getValue(),
|
||||
Collections
|
||||
.singletonList(
|
||||
new ScholixIdentifier(cf.getKey(), "dnet_identifier"))))
|
||||
.collect(Collectors.toList()));
|
||||
s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null));
|
||||
s.setSource(ScholixResource.fromSummary(scholixSummary));
|
||||
|
||||
s.setIdentifier(rel.getTarget());
|
||||
return s;
|
||||
}
|
||||
|
||||
private List<ScholixEntityId> mergeScholixEntityId(final List<ScholixEntityId> a, final List<ScholixEntityId> b) {
|
||||
final List<ScholixEntityId> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
|
||||
if (b != null)
|
||||
b.forEach(s -> {
|
||||
if (s != null) {
|
||||
int tt = (int) m
|
||||
.stream()
|
||||
.filter(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName()))
|
||||
.count();
|
||||
if (tt == 0) {
|
||||
m.add(s);
|
||||
}
|
||||
}
|
||||
});
|
||||
return m;
|
||||
}
|
||||
|
||||
private List<ScholixIdentifier> mergeScholixIdnetifier(final List<ScholixIdentifier> a,
|
||||
final List<ScholixIdentifier> b) {
|
||||
final List<ScholixIdentifier> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
|
||||
if (b != null)
|
||||
b.forEach(s -> {
|
||||
int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count();
|
||||
if (tt == 0) {
|
||||
m.add(s);
|
||||
}
|
||||
});
|
||||
return m;
|
||||
}
|
||||
|
||||
private List<ScholixCollectedFrom> mergeScholixCollectedFrom(final List<ScholixCollectedFrom> a,
|
||||
final List<ScholixCollectedFrom> b) {
|
||||
final List<ScholixCollectedFrom> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
|
||||
if (b != null)
|
||||
b.forEach(s -> {
|
||||
int tt = (int) m
|
||||
.stream()
|
||||
.filter(t -> t.getProvider().getName().equalsIgnoreCase(s.getProvider().getName()))
|
||||
.count();
|
||||
if (tt == 0) {
|
||||
m.add(s);
|
||||
}
|
||||
});
|
||||
return m;
|
||||
}
|
||||
|
||||
private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) {
|
||||
ScholixRelationship result = new ScholixRelationship();
|
||||
result.setName(a == null || StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
|
||||
result.setInverse(a == null || StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
|
||||
result.setSchema(a == null || StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
|
||||
return result;
|
||||
}
|
||||
|
||||
private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) {
|
||||
if (a == null)
|
||||
return b;
|
||||
final ScholixResource result = new ScholixResource();
|
||||
result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom()));
|
||||
result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator()));
|
||||
result
|
||||
.setDnetIdentifier(
|
||||
StringUtils.isBlank(a.getDnetIdentifier()) ? b.getDnetIdentifier() : a.getDnetIdentifier());
|
||||
result.setIdentifier(mergeScholixIdnetifier(a.getIdentifier(), b.getIdentifier()));
|
||||
result.setObjectType(StringUtils.isNotBlank(a.getObjectType()) ? a.getObjectType() : b.getObjectType());
|
||||
result
|
||||
.setObjectSubType(
|
||||
StringUtils.isNotBlank(a.getObjectSubType()) ? a.getObjectSubType() : b.getObjectSubType());
|
||||
result.setPublisher(mergeScholixEntityId(a.getPublisher(), b.getPublisher()));
|
||||
result
|
||||
.setPublicationDate(
|
||||
StringUtils.isNotBlank(a.getPublicationDate()) ? a.getPublicationDate() : b.getPublicationDate());
|
||||
result.setTitle(StringUtils.isNotBlank(a.getTitle()) ? a.getTitle() : b.getTitle());
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
public void mergeFrom(final Scholix other) {
|
||||
linkprovider = mergeScholixEntityId(linkprovider, other.getLinkprovider());
|
||||
publisher = mergeScholixEntityId(publisher, other.getPublisher());
|
||||
if (StringUtils.isEmpty(publicationDate))
|
||||
publicationDate = other.getPublicationDate();
|
||||
relationship = mergeRelationships(relationship, other.getRelationship());
|
||||
source = mergeResource(source, other.getSource());
|
||||
target = mergeResource(target, other.getTarget());
|
||||
generateIdentifier();
|
||||
}
|
||||
|
||||
public void generatelinkPublisher() {
|
||||
Set<String> publisher = new HashSet<>();
|
||||
if (source.getPublisher() != null)
|
||||
publisher
|
||||
.addAll(
|
||||
source
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.collect(Collectors.toList()));
|
||||
if (target.getPublisher() != null)
|
||||
publisher
|
||||
.addAll(
|
||||
target
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.collect(Collectors.toList()));
|
||||
this.publisher = publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public void generateIdentifier() {
|
||||
setIdentifier(
|
||||
DHPUtils
|
||||
.md5(
|
||||
String
|
||||
.format(
|
||||
"%s::%s::%s",
|
||||
source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier())));
|
||||
}
|
||||
|
||||
public Scholix addTarget(final String targetSummaryJson) {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
try {
|
||||
ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class);
|
||||
setTarget(ScholixResource.fromSummary(targetSummary));
|
||||
generateIdentifier();
|
||||
return this;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public String getPublicationDate() {
|
||||
return publicationDate;
|
||||
}
|
||||
|
||||
public void setPublicationDate(String publicationDate) {
|
||||
this.publicationDate = publicationDate;
|
||||
}
|
||||
|
||||
public List<ScholixEntityId> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(List<ScholixEntityId> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public List<ScholixEntityId> getLinkprovider() {
|
||||
return linkprovider;
|
||||
}
|
||||
|
||||
public void setLinkprovider(List<ScholixEntityId> linkprovider) {
|
||||
this.linkprovider = linkprovider;
|
||||
}
|
||||
|
||||
public ScholixRelationship getRelationship() {
|
||||
return relationship;
|
||||
}
|
||||
|
||||
public void setRelationship(ScholixRelationship relationship) {
|
||||
this.relationship = relationship;
|
||||
}
|
||||
|
||||
public ScholixResource getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(ScholixResource source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public ScholixResource getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(ScholixResource target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
}
|
@ -1,45 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class ScholixCollectedFrom implements Serializable {
|
||||
|
||||
private ScholixEntityId provider;
|
||||
private String provisionMode;
|
||||
private String completionStatus;
|
||||
|
||||
public ScholixCollectedFrom() {
|
||||
}
|
||||
|
||||
public ScholixCollectedFrom(
|
||||
ScholixEntityId provider, String provisionMode, String completionStatus) {
|
||||
this.provider = provider;
|
||||
this.provisionMode = provisionMode;
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
|
||||
public ScholixEntityId getProvider() {
|
||||
return provider;
|
||||
}
|
||||
|
||||
public void setProvider(ScholixEntityId provider) {
|
||||
this.provider = provider;
|
||||
}
|
||||
|
||||
public String getProvisionMode() {
|
||||
return provisionMode;
|
||||
}
|
||||
|
||||
public void setProvisionMode(String provisionMode) {
|
||||
this.provisionMode = provisionMode;
|
||||
}
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class ScholixEntityId implements Serializable {
|
||||
private String name;
|
||||
private List<ScholixIdentifier> identifiers;
|
||||
|
||||
public ScholixEntityId() {
|
||||
}
|
||||
|
||||
public ScholixEntityId(String name, List<ScholixIdentifier> identifiers) {
|
||||
this.name = name;
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public List<ScholixIdentifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<ScholixIdentifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * An identifier value paired with the schema it belongs to.
 */
public class ScholixIdentifier implements Serializable {

	private String identifier;
	private String schema;

	/** Creates an instance with every member unset. */
	public ScholixIdentifier() {
	}

	/**
	 * @param identifier the identifier value
	 * @param schema the schema of the identifier
	 */
	public ScholixIdentifier(final String identifier, final String schema) {
		this.identifier = identifier;
		this.schema = schema;
	}

	/** @return the identifier value */
	public String getIdentifier() {
		return this.identifier;
	}

	/** @param identifier the identifier value */
	public void setIdentifier(final String identifier) {
		this.identifier = identifier;
	}

	/** @return the schema of the identifier */
	public String getSchema() {
		return this.schema;
	}

	/** @param schema the schema of the identifier */
	public void setSchema(final String schema) {
		this.schema = schema;
	}
}
|
@ -1,43 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * The relationship of a Scholix link: its name, the schema the name belongs to and the
 * name of the inverse relationship.
 */
public class ScholixRelationship implements Serializable {

	private String name;
	private String schema;
	private String inverse;

	/** Creates an instance with every member unset. */
	public ScholixRelationship() {
	}

	/**
	 * @param name the name of the relationship
	 * @param schema the schema of the relationship
	 * @param inverse the name of the inverse relationship
	 */
	public ScholixRelationship(final String name, final String schema, final String inverse) {
		this.name = name;
		this.schema = schema;
		this.inverse = inverse;
	}

	/** @return the name of the relationship */
	public String getName() {
		return this.name;
	}

	/** @param name the name of the relationship */
	public void setName(final String name) {
		this.name = name;
	}

	/** @return the schema of the relationship */
	public String getSchema() {
		return this.schema;
	}

	/** @param schema the schema of the relationship */
	public void setSchema(final String schema) {
		this.schema = schema;
	}

	/** @return the name of the inverse relationship */
	public String getInverse() {
		return this.inverse;
	}

	/** @param inverse the name of the inverse relationship */
	public void setInverse(final String inverse) {
		this.inverse = inverse;
	}
}
|
@ -1,151 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||
|
||||
public class ScholixResource implements Serializable {
|
||||
|
||||
private List<ScholixIdentifier> identifier;
|
||||
private String dnetIdentifier;
|
||||
private String objectType;
|
||||
private String objectSubType;
|
||||
private String title;
|
||||
private List<ScholixEntityId> creator;
|
||||
private String publicationDate;
|
||||
private List<ScholixEntityId> publisher;
|
||||
private List<ScholixCollectedFrom> collectedFrom;
|
||||
|
||||
public static ScholixResource fromSummary(ScholixSummary summary) {
|
||||
|
||||
final ScholixResource resource = new ScholixResource();
|
||||
|
||||
resource.setDnetIdentifier(summary.getId());
|
||||
|
||||
resource
|
||||
.setIdentifier(
|
||||
summary
|
||||
.getLocalIdentifier()
|
||||
.stream()
|
||||
.map(i -> new ScholixIdentifier(i.getId(), i.getType()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
resource.setObjectType(summary.getTypology().toString());
|
||||
|
||||
if (summary.getTitle() != null && summary.getTitle().size() > 0)
|
||||
resource.setTitle(summary.getTitle().get(0));
|
||||
|
||||
if (summary.getAuthor() != null)
|
||||
resource
|
||||
.setCreator(
|
||||
summary
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(c -> new ScholixEntityId(c, null))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
if (summary.getDate() != null && summary.getDate().size() > 0)
|
||||
resource.setPublicationDate(summary.getDate().get(0));
|
||||
if (summary.getPublisher() != null)
|
||||
resource
|
||||
.setPublisher(
|
||||
summary
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(p -> new ScholixEntityId(p, null))
|
||||
.collect(Collectors.toList()));
|
||||
if (summary.getDatasources() != null)
|
||||
resource
|
||||
.setCollectedFrom(
|
||||
summary
|
||||
.getDatasources()
|
||||
.stream()
|
||||
.map(
|
||||
d -> new ScholixCollectedFrom(
|
||||
new ScholixEntityId(
|
||||
d.getDatasourceName(),
|
||||
Collections
|
||||
.singletonList(
|
||||
new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))),
|
||||
"collected",
|
||||
d.getCompletionStatus()))
|
||||
.collect(Collectors.toList()));
|
||||
return resource;
|
||||
}
|
||||
|
||||
public List<ScholixIdentifier> getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(List<ScholixIdentifier> identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
public String getDnetIdentifier() {
|
||||
return dnetIdentifier;
|
||||
}
|
||||
|
||||
public void setDnetIdentifier(String dnetIdentifier) {
|
||||
this.dnetIdentifier = dnetIdentifier;
|
||||
}
|
||||
|
||||
public String getObjectType() {
|
||||
return objectType;
|
||||
}
|
||||
|
||||
public void setObjectType(String objectType) {
|
||||
this.objectType = objectType;
|
||||
}
|
||||
|
||||
public String getObjectSubType() {
|
||||
return objectSubType;
|
||||
}
|
||||
|
||||
public void setObjectSubType(String objectSubType) {
|
||||
this.objectSubType = objectSubType;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public List<ScholixEntityId> getCreator() {
|
||||
return creator;
|
||||
}
|
||||
|
||||
public void setCreator(List<ScholixEntityId> creator) {
|
||||
this.creator = creator;
|
||||
}
|
||||
|
||||
public String getPublicationDate() {
|
||||
return publicationDate;
|
||||
}
|
||||
|
||||
public void setPublicationDate(String publicationDate) {
|
||||
this.publicationDate = publicationDate;
|
||||
}
|
||||
|
||||
public List<ScholixEntityId> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(List<ScholixEntityId> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public List<ScholixCollectedFrom> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
|
||||
public void setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
}
|
@ -1,44 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix.summary;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Datasource a summary has been collected from: its name, its identifier and the
 * completion status of the collected record.
 */
public class CollectedFromType implements Serializable {

	private String datasourceName;
	private String datasourceId;
	private String completionStatus;

	/** Creates an instance with every member unset. */
	public CollectedFromType() {
	}

	/**
	 * @param datasourceName the name of the datasource
	 * @param datasourceId the identifier of the datasource
	 * @param completionStatus the completion status
	 */
	public CollectedFromType(final String datasourceName, final String datasourceId, final String completionStatus) {
		this.datasourceName = datasourceName;
		this.datasourceId = datasourceId;
		this.completionStatus = completionStatus;
	}

	/** @return the name of the datasource */
	public String getDatasourceName() {
		return this.datasourceName;
	}

	/** @param datasourceName the name of the datasource */
	public void setDatasourceName(final String datasourceName) {
		this.datasourceName = datasourceName;
	}

	/** @return the identifier of the datasource */
	public String getDatasourceId() {
		return this.datasourceId;
	}

	/** @param datasourceId the identifier of the datasource */
	public void setDatasourceId(final String datasourceId) {
		this.datasourceId = datasourceId;
	}

	/** @return the completion status */
	public String getCompletionStatus() {
		return this.completionStatus;
	}

	/** @param completionStatus the completion status */
	public void setCompletionStatus(final String completionStatus) {
		this.completionStatus = completionStatus;
	}
}
|
@ -1,33 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix.summary;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * A value qualified by the scheme it belongs to.
 */
public class SchemeValue implements Serializable {

	private String scheme;
	private String value;

	/** Creates an instance with every member unset. */
	public SchemeValue() {
	}

	/**
	 * @param scheme the scheme of the value
	 * @param value the value
	 */
	public SchemeValue(final String scheme, final String value) {
		this.scheme = scheme;
		this.value = value;
	}

	/** @return the scheme of the value */
	public String getScheme() {
		return this.scheme;
	}

	/** @param scheme the scheme of the value */
	public void setScheme(final String scheme) {
		this.scheme = scheme;
	}

	/** @return the value */
	public String getValue() {
		return this.value;
	}

	/** @param value the value */
	public void setValue(final String value) {
		this.value = value;
	}
}
|
@ -1,321 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix.summary;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.provision.RelatedItemInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
||||
|
||||
public class ScholixSummary implements Serializable {
|
||||
private String id;
|
||||
private List<TypedIdentifier> localIdentifier;
|
||||
private Typology typology;
|
||||
private List<String> title;
|
||||
private List<String> author;
|
||||
private List<String> date;
|
||||
private String description;
|
||||
private List<SchemeValue> subject;
|
||||
private List<String> publisher;
|
||||
private long relatedPublications;
|
||||
private long relatedDatasets;
|
||||
private long relatedUnknown;
|
||||
private List<CollectedFromType> datasources;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<TypedIdentifier> getLocalIdentifier() {
|
||||
return localIdentifier;
|
||||
}
|
||||
|
||||
public void setLocalIdentifier(List<TypedIdentifier> localIdentifier) {
|
||||
this.localIdentifier = localIdentifier;
|
||||
}
|
||||
|
||||
public Typology getTypology() {
|
||||
return typology;
|
||||
}
|
||||
|
||||
public void setTypology(Typology typology) {
|
||||
this.typology = typology;
|
||||
}
|
||||
|
||||
public List<String> getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(List<String> title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public List<String> getAuthor() {
|
||||
return author;
|
||||
}
|
||||
|
||||
public void setAuthor(List<String> author) {
|
||||
this.author = author;
|
||||
}
|
||||
|
||||
public List<String> getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(List<String> date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
@JsonProperty("abstract")
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
@JsonProperty("abstract")
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public List<SchemeValue> getSubject() {
|
||||
return subject;
|
||||
}
|
||||
|
||||
public void setSubject(List<SchemeValue> subject) {
|
||||
this.subject = subject;
|
||||
}
|
||||
|
||||
public List<String> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(List<String> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public long getRelatedPublications() {
|
||||
return relatedPublications;
|
||||
}
|
||||
|
||||
public void setRelatedPublications(long relatedPublications) {
|
||||
this.relatedPublications = relatedPublications;
|
||||
}
|
||||
|
||||
public long getRelatedDatasets() {
|
||||
return relatedDatasets;
|
||||
}
|
||||
|
||||
public void setRelatedDatasets(long relatedDatasets) {
|
||||
this.relatedDatasets = relatedDatasets;
|
||||
}
|
||||
|
||||
public long getRelatedUnknown() {
|
||||
return relatedUnknown;
|
||||
}
|
||||
|
||||
public void setRelatedUnknown(long relatedUnknown) {
|
||||
this.relatedUnknown = relatedUnknown;
|
||||
}
|
||||
|
||||
public List<CollectedFromType> getDatasources() {
|
||||
return datasources;
|
||||
}
|
||||
|
||||
public void setDatasources(List<CollectedFromType> datasources) {
|
||||
this.datasources = datasources;
|
||||
}
|
||||
|
||||
public static ScholixSummary fromOAF(final Oaf oaf) {
|
||||
try {
|
||||
final RelatedItemInfo relatedItemInfo = new RelatedItemInfo();
|
||||
|
||||
if (oaf instanceof DLIPublication)
|
||||
return summaryFromPublication((DLIPublication) oaf, relatedItemInfo);
|
||||
if (oaf instanceof DLIDataset)
|
||||
return summaryFromDataset((DLIDataset) oaf, relatedItemInfo);
|
||||
if (oaf instanceof DLIUnknown)
|
||||
return summaryFromUnknown((DLIUnknown) oaf, relatedItemInfo);
|
||||
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static ScholixSummary summaryFromDataset(
|
||||
final DLIDataset item, final RelatedItemInfo relatedItemInfo) {
|
||||
ScholixSummary summary = new ScholixSummary();
|
||||
summary.setId(item.getId());
|
||||
|
||||
if (item.getPid() != null)
|
||||
summary
|
||||
.setLocalIdentifier(
|
||||
item
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
summary.setTypology(Typology.dataset);
|
||||
if (item.getTitle() != null)
|
||||
summary
|
||||
.setTitle(
|
||||
item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||
|
||||
if (item.getAuthor() != null) {
|
||||
summary
|
||||
.setAuthor(
|
||||
item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (item.getRelevantdate() != null)
|
||||
summary
|
||||
.setDate(
|
||||
item
|
||||
.getRelevantdate()
|
||||
.stream()
|
||||
.filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname()))
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
if (item.getDescription() != null && item.getDescription().size() > 0)
|
||||
summary.setDescription(item.getDescription().get(0).getValue());
|
||||
|
||||
if (item.getSubject() != null) {
|
||||
summary
|
||||
.setSubject(
|
||||
item
|
||||
.getSubject()
|
||||
.stream()
|
||||
.map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (item.getPublisher() != null)
|
||||
summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
|
||||
|
||||
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
||||
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
||||
summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
|
||||
|
||||
if (item.getDlicollectedfrom() != null)
|
||||
summary
|
||||
.setDatasources(
|
||||
item
|
||||
.getDlicollectedfrom()
|
||||
.stream()
|
||||
.map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()))
|
||||
.collect(Collectors.toList()));
|
||||
return summary;
|
||||
}
|
||||
|
||||
private static ScholixSummary summaryFromPublication(
|
||||
final DLIPublication item, final RelatedItemInfo relatedItemInfo) {
|
||||
ScholixSummary summary = new ScholixSummary();
|
||||
summary.setId(item.getId());
|
||||
|
||||
if (item.getPid() != null)
|
||||
summary
|
||||
.setLocalIdentifier(
|
||||
item
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
summary.setTypology(Typology.publication);
|
||||
if (item.getTitle() != null)
|
||||
summary
|
||||
.setTitle(
|
||||
item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||
|
||||
if (item.getAuthor() != null) {
|
||||
summary
|
||||
.setAuthor(
|
||||
item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (item.getRelevantdate() != null)
|
||||
summary
|
||||
.setDate(
|
||||
item
|
||||
.getRelevantdate()
|
||||
.stream()
|
||||
.filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname()))
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
if (item.getDescription() != null && item.getDescription().size() > 0)
|
||||
summary.setDescription(item.getDescription().get(0).getValue());
|
||||
|
||||
if (item.getSubject() != null) {
|
||||
summary
|
||||
.setSubject(
|
||||
item
|
||||
.getSubject()
|
||||
.stream()
|
||||
.map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (item.getPublisher() != null)
|
||||
summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
|
||||
|
||||
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
||||
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
||||
summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
|
||||
|
||||
if (item.getDlicollectedfrom() != null)
|
||||
summary
|
||||
.setDatasources(
|
||||
item
|
||||
.getDlicollectedfrom()
|
||||
.stream()
|
||||
.map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return summary;
|
||||
}
|
||||
|
||||
private static ScholixSummary summaryFromUnknown(
|
||||
final DLIUnknown item, final RelatedItemInfo relatedItemInfo) {
|
||||
ScholixSummary summary = new ScholixSummary();
|
||||
summary.setId(item.getId());
|
||||
if (item.getPid() != null)
|
||||
summary
|
||||
.setLocalIdentifier(
|
||||
item
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
||||
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
||||
summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
|
||||
summary.setTypology(Typology.unknown);
|
||||
if (item.getDlicollectedfrom() != null)
|
||||
summary
|
||||
.setDatasources(
|
||||
item
|
||||
.getDlicollectedfrom()
|
||||
.stream()
|
||||
.map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()))
|
||||
.collect(Collectors.toList()));
|
||||
return summary;
|
||||
}
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix.summary;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean pairing an identifier value with the name of its type.
 *
 * <p>Both properties are plain mutable strings; no validation is performed and either may be
 * {@code null}.
 */
public class TypedIdentifier implements Serializable {

	/** The identifier value. */
	private String id;

	/** The kind of identifier stored in {@link #id}. */
	private String type;

	/** Creates an instance with both properties left {@code null}. */
	public TypedIdentifier() {
	}

	/**
	 * Creates an instance with both properties set.
	 *
	 * @param id the identifier value
	 * @param type the identifier type
	 */
	public TypedIdentifier(final String id, final String type) {
		this.id = id;
		this.type = type;
	}

	/** @return the identifier value */
	public String getId() {
		return this.id;
	}

	/** @param id the identifier value to store */
	public void setId(final String id) {
		this.id = id;
	}

	/** @return the identifier type */
	public String getType() {
		return this.type;
	}

	/** @param type the identifier type to store */
	public void setType(final String type) {
		this.type = type;
	}
}
|
@ -1,8 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.scholix.summary;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Kind of entity a summary refers to. The constant order (dataset, publication, unknown) is
 * part of the enum's observable behaviour through {@code ordinal()} and {@code values()}.
 */
public enum Typology implements Serializable {
	/** The entity is a dataset. */
	dataset,
	/** The entity is a publication. */
	publication,
	/** The entity type could not be determined. */
	unknown
}
|
@ -1,131 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.gson.JsonArray;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonObject;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixCollectedFrom;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixEntityId;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class CrossRefParserJSON {
|
||||
|
||||
private static final List<ScholixCollectedFrom> collectedFrom = generateCrossrefCollectedFrom("complete");
|
||||
|
||||
public static ScholixResource parseRecord(final String record) {
|
||||
if (record == null)
|
||||
return null;
|
||||
JsonElement jElement = new JsonParser().parse(record);
|
||||
JsonElement source = null;
|
||||
if (jElement.getAsJsonObject().has("_source")) {
|
||||
source = jElement.getAsJsonObject().get("_source");
|
||||
if (source == null || !source.isJsonObject())
|
||||
return null;
|
||||
} else if (jElement.getAsJsonObject().has("DOI")) {
|
||||
source = jElement;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
final JsonObject message = source.getAsJsonObject();
|
||||
ScholixResource currentObject = new ScholixResource();
|
||||
|
||||
if (message.get("DOI") != null) {
|
||||
final String doi = message.get("DOI").getAsString();
|
||||
currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi")));
|
||||
}
|
||||
|
||||
if ((!message.get("created").isJsonNull())
|
||||
&& (message.getAsJsonObject("created").get("date-time") != null)) {
|
||||
currentObject
|
||||
.setPublicationDate(
|
||||
message.getAsJsonObject("created").get("date-time").getAsString());
|
||||
}
|
||||
|
||||
if (message.get("title") != null
|
||||
&& !message.get("title").isJsonNull()
|
||||
&& message.get("title").isJsonArray()) {
|
||||
|
||||
JsonArray array = message.get("title").getAsJsonArray();
|
||||
currentObject.setTitle(array.get(0).getAsString());
|
||||
}
|
||||
if (message.get("author") != null && !message.get("author").isJsonNull()) {
|
||||
JsonArray author = message.getAsJsonArray("author");
|
||||
List<ScholixEntityId> authorList = new ArrayList<>();
|
||||
for (JsonElement anAuthor : author) {
|
||||
JsonObject currentAuth = anAuthor.getAsJsonObject();
|
||||
|
||||
String family = "";
|
||||
String given = "";
|
||||
if (currentAuth != null
|
||||
&& currentAuth.get("family") != null
|
||||
&& !currentAuth.get("family").isJsonNull()) {
|
||||
family = currentAuth.get("family").getAsString();
|
||||
}
|
||||
if (currentAuth != null
|
||||
&& currentAuth.get("given") != null
|
||||
&& !currentAuth.get("given").isJsonNull()) {
|
||||
given = currentAuth.get("given").getAsString();
|
||||
}
|
||||
authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null));
|
||||
}
|
||||
currentObject.setCreator(authorList);
|
||||
}
|
||||
if (message.get("publisher") != null && !message.get("publisher").isJsonNull()) {
|
||||
currentObject
|
||||
.setPublisher(
|
||||
Collections
|
||||
.singletonList(
|
||||
new ScholixEntityId(message.get("publisher").getAsString(), null)));
|
||||
}
|
||||
currentObject.setCollectedFrom(collectedFrom);
|
||||
currentObject.setObjectType("publication");
|
||||
currentObject
|
||||
.setDnetIdentifier(
|
||||
generateId(message.get("DOI").getAsString(), "doi", "publication"));
|
||||
|
||||
return currentObject;
|
||||
}
|
||||
|
||||
private static List<ScholixCollectedFrom> generateCrossrefCollectedFrom(
|
||||
final String completionStatus) {
|
||||
final ScholixEntityId scholixEntityId = new ScholixEntityId(
|
||||
"Crossref",
|
||||
Collections
|
||||
.singletonList(
|
||||
new ScholixIdentifier("dli_________::crossref", "dnet_identifier")));
|
||||
return Collections
|
||||
.singletonList(
|
||||
new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus));
|
||||
}
|
||||
|
||||
private static String generateId(
|
||||
final String pid, final String pidType, final String entityType) {
|
||||
String type;
|
||||
switch (entityType) {
|
||||
case "publication":
|
||||
type = "50|";
|
||||
break;
|
||||
case "dataset":
|
||||
type = "60|";
|
||||
break;
|
||||
case "unknown":
|
||||
type = "70|";
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unexpected value " + entityType);
|
||||
}
|
||||
return type
|
||||
+ DHPUtils
|
||||
.md5(
|
||||
String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
|
||||
}
|
||||
}
|
@ -1,90 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
|
||||
|
||||
public class CrossrefClient {
|
||||
|
||||
private String host;
|
||||
private String index = "crossref";
|
||||
private String indexType = "item";
|
||||
|
||||
public CrossrefClient(String host) {
|
||||
this.host = host;
|
||||
}
|
||||
|
||||
public String getHost() {
|
||||
return host;
|
||||
}
|
||||
|
||||
public void setHost(String host) {
|
||||
this.host = host;
|
||||
}
|
||||
|
||||
public String getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public void setIndex(String index) {
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
public String getIndexType() {
|
||||
return indexType;
|
||||
}
|
||||
|
||||
public void setIndexType(String indexType) {
|
||||
this.indexType = indexType;
|
||||
}
|
||||
|
||||
private static String decompressBlob(final String blob) {
|
||||
try {
|
||||
byte[] byteArray = Base64.decodeBase64(blob.getBytes());
|
||||
final Inflater decompresser = new Inflater();
|
||||
decompresser.setInput(byteArray);
|
||||
final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
|
||||
byte[] buffer = new byte[8192];
|
||||
while (!decompresser.finished()) {
|
||||
int size = decompresser.inflate(buffer);
|
||||
bos.write(buffer, 0, size);
|
||||
}
|
||||
decompresser.end();
|
||||
return bos.toString();
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("Wrong record:" + blob, e);
|
||||
}
|
||||
}
|
||||
|
||||
public ScholixResource getResourceByDOI(final String doi) {
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet(
|
||||
String
|
||||
.format(
|
||||
"http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F")));
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
String json = IOUtils.toString(response.getEntity().getContent());
|
||||
if (json.contains("blob")) {
|
||||
JsonParser p = new JsonParser();
|
||||
final JsonElement root = p.parse(json);
|
||||
json = decompressBlob(
|
||||
root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString());
|
||||
}
|
||||
return CrossRefParserJSON.parseRecord(json);
|
||||
} catch (Throwable e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,229 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.scholexplorer.relation.RelInfo;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public class Datacite2Scholix {
|
||||
|
||||
private String rootPath = "$.attributes";
|
||||
final RelationMapper relationMapper;
|
||||
|
||||
public Datacite2Scholix(RelationMapper relationMapper) {
|
||||
this.relationMapper = relationMapper;
|
||||
}
|
||||
|
||||
public List<Scholix> generateScholixFromJson(final String dJson) {
|
||||
List<Map<String, String>> relIds = getRelatedIendtifiers(dJson);
|
||||
relIds = relIds != null
|
||||
? relIds
|
||||
.stream()
|
||||
.filter(
|
||||
m -> m.containsKey("relatedIdentifierType")
|
||||
&& m.containsKey("relationType")
|
||||
&& m.containsKey("relatedIdentifier"))
|
||||
.collect(Collectors.toList())
|
||||
: null;
|
||||
if (relIds == null || relIds.size() == 0)
|
||||
return null;
|
||||
|
||||
final String updated = JsonPath.read(dJson, rootPath + ".updated");
|
||||
ScholixResource resource = generateDataciteScholixResource(dJson);
|
||||
|
||||
return relIds
|
||||
.stream()
|
||||
.flatMap(
|
||||
s -> {
|
||||
try {
|
||||
final List<Scholix> result = generateScholix(
|
||||
resource,
|
||||
"" + s.get("relatedIdentifier"),
|
||||
s.get("relatedIdentifierType"),
|
||||
s.get("relationType"),
|
||||
updated);
|
||||
return result.stream();
|
||||
} catch (Throwable e) {
|
||||
return new ArrayList<Scholix>().stream();
|
||||
}
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public String getRootPath() {
|
||||
return rootPath;
|
||||
}
|
||||
|
||||
public void setRootPath(String rootPath) {
|
||||
this.rootPath = rootPath;
|
||||
}
|
||||
|
||||
private List<Scholix> generateScholix(
|
||||
ScholixResource source,
|
||||
final String pid,
|
||||
final String pidtype,
|
||||
final String relType,
|
||||
final String updated) {
|
||||
|
||||
if ("doi".equalsIgnoreCase(pidtype)) {
|
||||
ScholixResource target = new ScholixResource();
|
||||
target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype)));
|
||||
final RelInfo relInfo = relationMapper.get(relType.toLowerCase());
|
||||
final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite",
|
||||
relInfo.getInverse());
|
||||
final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider();
|
||||
final Scholix s = new Scholix();
|
||||
s.setSource(source);
|
||||
s.setTarget(target);
|
||||
s.setLinkprovider(Collections.singletonList(provider));
|
||||
s.setPublisher(source.getPublisher());
|
||||
s.setRelationship(rel);
|
||||
s.setPublicationDate(updated);
|
||||
return Collections.singletonList(s);
|
||||
} else {
|
||||
final List<Scholix> result = new ArrayList<>();
|
||||
ScholixResource target = new ScholixResource();
|
||||
target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype)));
|
||||
target.setDnetIdentifier(generateId(pid, pidtype, "unknown"));
|
||||
target.setObjectType("unknown");
|
||||
target.setCollectedFrom(generateDataciteCollectedFrom("incomplete"));
|
||||
final RelInfo relInfo = relationMapper.get(relType.toLowerCase());
|
||||
final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite",
|
||||
relInfo.getInverse());
|
||||
final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider();
|
||||
final Scholix s = new Scholix();
|
||||
s.setSource(source);
|
||||
s.setTarget(target);
|
||||
s.setLinkprovider(Collections.singletonList(provider));
|
||||
s.setPublisher(source.getPublisher());
|
||||
s.setRelationship(rel);
|
||||
s.setPublicationDate(updated);
|
||||
s.generateIdentifier();
|
||||
result.add(s);
|
||||
final Scholix s2 = new Scholix();
|
||||
s2.setSource(target);
|
||||
s2.setTarget(source);
|
||||
s2.setLinkprovider(Collections.singletonList(provider));
|
||||
s2.setPublisher(source.getPublisher());
|
||||
s2
|
||||
.setRelationship(
|
||||
new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal()));
|
||||
s2.setPublicationDate(updated);
|
||||
s2.generateIdentifier();
|
||||
result.add(s2);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
public ScholixResource generateDataciteScholixResource(String dJson) {
|
||||
ScholixResource resource = new ScholixResource();
|
||||
String DOI_PATH = rootPath + ".doi";
|
||||
final String doi = JsonPath.read(dJson, DOI_PATH);
|
||||
resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi")));
|
||||
resource.setObjectType(getType(dJson));
|
||||
resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType()));
|
||||
resource.setCollectedFrom(generateDataciteCollectedFrom("complete"));
|
||||
final String publisher = JsonPath.read(dJson, rootPath + ".publisher");
|
||||
if (StringUtils.isNotBlank(publisher))
|
||||
resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null)));
|
||||
final String date = getDate(dJson);
|
||||
if (StringUtils.isNotBlank(date))
|
||||
resource.setPublicationDate(date);
|
||||
final String title = getTitle(dJson);
|
||||
if (StringUtils.isNotBlank(title))
|
||||
resource.setTitle(title);
|
||||
resource.setCreator(getCreators(dJson));
|
||||
return resource;
|
||||
}
|
||||
|
||||
private List<ScholixEntityId> getCreators(final String json) {
|
||||
final List<String> creatorName = JsonPath.read(json, rootPath + ".creators[*].name");
|
||||
if (creatorName != null && creatorName.size() > 0) {
|
||||
return creatorName
|
||||
.stream()
|
||||
.map(s -> new ScholixEntityId(s, null))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String getTitle(final String json) {
|
||||
final List<String> titles = JsonPath.read(json, rootPath + ".titles[*].title");
|
||||
return titles != null && titles.size() > 0 ? titles.get(0) : null;
|
||||
}
|
||||
|
||||
private String getDate(final String json) {
|
||||
final List<Map<String, String>> dates = JsonPath.read(json, rootPath + ".dates");
|
||||
if (dates != null && dates.size() > 0) {
|
||||
|
||||
List<Map<String, String>> issued = dates
|
||||
.stream()
|
||||
.filter(s -> "issued".equalsIgnoreCase(s.get("dateType")))
|
||||
.collect(Collectors.toList());
|
||||
if (issued.size() > 0)
|
||||
return issued.get(0).get("date");
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<ScholixCollectedFrom> generateDataciteCollectedFrom(final String completionStatus) {
|
||||
final ScholixEntityId scholixEntityId = new ScholixEntityId(
|
||||
"Datasets in Datacite",
|
||||
Collections
|
||||
.singletonList(
|
||||
new ScholixIdentifier("dli_________::datacite", "dnet_identifier")));
|
||||
return Collections
|
||||
.singletonList(
|
||||
new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus));
|
||||
}
|
||||
|
||||
private String getType(final String json) {
|
||||
try {
|
||||
final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex");
|
||||
if ("article".equalsIgnoreCase(bibtext)) {
|
||||
return "publication";
|
||||
}
|
||||
return "dataset";
|
||||
} catch (Throwable e) {
|
||||
return "dataset";
|
||||
}
|
||||
}
|
||||
|
||||
private List<Map<String, String>> getRelatedIendtifiers(final String json) {
|
||||
String REL_IDENTIFIER_PATH = rootPath + ".relatedIdentifiers[*]";
|
||||
List<Map<String, String>> res = JsonPath.read(json, REL_IDENTIFIER_PATH);
|
||||
return res;
|
||||
}
|
||||
|
||||
public static String generateId(final String pid, final String pidType, final String entityType) {
|
||||
String type;
|
||||
switch (entityType) {
|
||||
case "publication":
|
||||
type = "50|";
|
||||
break;
|
||||
case "dataset":
|
||||
type = "60|";
|
||||
break;
|
||||
case "unknown":
|
||||
type = "70|";
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unexpected value " + entityType);
|
||||
}
|
||||
return type
|
||||
+ DHPUtils
|
||||
.md5(
|
||||
String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
|
||||
}
|
||||
}
|
@ -1,75 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
|
||||
|
||||
public class DataciteClient {
|
||||
|
||||
private String host;
|
||||
private String index = "datacite";
|
||||
private String indexType = "dump";
|
||||
private final Datacite2Scholix d2s;
|
||||
|
||||
public DataciteClient(String host) {
|
||||
this.host = host;
|
||||
|
||||
d2s = new Datacite2Scholix(null);
|
||||
d2s.setRootPath("$._source.attributes");
|
||||
}
|
||||
|
||||
public Iterable<String> getDatasetsFromTs(final Long timestamp) {
|
||||
return () -> {
|
||||
try {
|
||||
return new DataciteClientIterator(host, index, timestamp);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public String getHost() {
|
||||
return host;
|
||||
}
|
||||
|
||||
public void setHost(String host) {
|
||||
this.host = host;
|
||||
}
|
||||
|
||||
public String getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public void setIndex(String index) {
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
public String getIndexType() {
|
||||
return indexType;
|
||||
}
|
||||
|
||||
public void setIndexType(String indexType) {
|
||||
this.indexType = indexType;
|
||||
}
|
||||
|
||||
public ScholixResource getDatasetByDOI(final String doi) {
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet(
|
||||
String
|
||||
.format(
|
||||
"http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F")));
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
final String json = IOUtils.toString(response.getEntity().getContent());
|
||||
return d2s.generateDataciteScholixResource(json);
|
||||
} catch (Throwable e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,120 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import net.minidev.json.JSONArray;
|
||||
|
||||
public class DataciteClientIterator implements Iterator<String> {
|
||||
|
||||
static final String blobPath = "$.hits.hits[*]._source";
|
||||
static final String scrollIdPath = "$._scroll_id";
|
||||
|
||||
String scrollId;
|
||||
|
||||
List<String> buffer;
|
||||
|
||||
final String esHost;
|
||||
final String esIndex;
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
public DataciteClientIterator(final String esHost, final String esIndex, long timestamp)
|
||||
throws IOException {
|
||||
|
||||
this.esHost = esHost;
|
||||
this.esIndex = esIndex;
|
||||
// THIS FIX IS NECESSARY to avoid different timezone
|
||||
timestamp -= (60 * 60 * 2);
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
|
||||
String
|
||||
.format(
|
||||
"{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp));
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
public String getResponse(final String url, final String json) {
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
try {
|
||||
|
||||
HttpPost httpPost = new HttpPost(url);
|
||||
if (json != null) {
|
||||
StringEntity entity = new StringEntity(json);
|
||||
httpPost.setEntity(entity);
|
||||
httpPost.setHeader("Accept", "application/json");
|
||||
httpPost.setHeader("Content-type", "application/json");
|
||||
}
|
||||
CloseableHttpResponse response = client.execute(httpPost);
|
||||
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("Error on executing request ", e);
|
||||
} finally {
|
||||
try {
|
||||
client.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close client ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String) o;
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getBlobs(final String body) {
|
||||
JSONArray array = JsonPath.read(body, blobPath);
|
||||
return array
|
||||
.stream()
|
||||
.map(
|
||||
o -> {
|
||||
try {
|
||||
return mapper.writeValueAsString(o);
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return (buffer != null && !buffer.isEmpty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
final String nextItem = buffer.remove(0);
|
||||
if (buffer.isEmpty()) {
|
||||
final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId);
|
||||
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
|
||||
try {
|
||||
buffer = getBlobs(body);
|
||||
} catch (Throwable e) {
|
||||
System.out.println(body);
|
||||
}
|
||||
}
|
||||
return nextItem;
|
||||
}
|
||||
}
|
@ -1,72 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.net.URI;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public class RetrieveUpdateFromDatacite {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
RetrieveUpdateFromDatacite.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final String hdfsuri = parser.get("namenode");
|
||||
Path hdfswritepath = new Path(parser.get("targetPath"));
|
||||
final long timestamp = Long.parseLong(parser.get("timestamp"));
|
||||
final String host = parser.get("indexHost");
|
||||
final String index = parser.get("indexName");
|
||||
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
FileSystem.get(URI.create(hdfsuri), conf);
|
||||
final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfswritepath),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
final Text value = new Text();
|
||||
final IntWritable key = new IntWritable();
|
||||
int i = 0;
|
||||
for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) {
|
||||
i++;
|
||||
List<Scholix> scholix = d2s.generateScholixFromJson(dataset);
|
||||
if (scholix != null)
|
||||
for (Scholix s : scholix) {
|
||||
key.set(i);
|
||||
value.set(mapper.writeValueAsString(s));
|
||||
writer.append(key, value);
|
||||
if (i % 10000 == 0) {
|
||||
System.out.println("wrote " + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,184 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision.update;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixRelationship;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkResolveScholixTarget {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkResolveScholixTarget.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String master = parser.get("master");
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String workingDirPath = parser.get("workingDirPath");
|
||||
final String indexHost = parser.get("indexHost");
|
||||
try (SparkSession spark = getSession(conf, master)) {
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
spark
|
||||
.createDataset(
|
||||
sc
|
||||
.sequenceFile(sourcePath, IntWritable.class, Text.class)
|
||||
.map(Tuple2::_2)
|
||||
.map(s -> new ObjectMapper().readValue(s.toString(), Scholix.class))
|
||||
.rdd(),
|
||||
Encoders.bean(Scholix.class))
|
||||
.write()
|
||||
.save(workingDirPath + "/stepA");
|
||||
|
||||
Dataset<Scholix> s1 = spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class));
|
||||
|
||||
s1
|
||||
.where(s1.col("target.dnetIdentifier").isNull())
|
||||
.select(s1.col("target.identifier"))
|
||||
.distinct()
|
||||
.map(
|
||||
(MapFunction<Row, ScholixResource>) f -> {
|
||||
final String pid = ((Row) f.getList(0).get(0)).getString(0);
|
||||
ScholixResource publication = new CrossrefClient(indexHost).getResourceByDOI(pid);
|
||||
if (publication != null) {
|
||||
return publication;
|
||||
}
|
||||
ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid);
|
||||
if (dataset != null) {
|
||||
return dataset;
|
||||
}
|
||||
ScholixResource r = new ScholixResource();
|
||||
r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi")));
|
||||
r.setObjectType("unknown");
|
||||
r
|
||||
.setDnetIdentifier(
|
||||
"70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim())));
|
||||
|
||||
return r;
|
||||
},
|
||||
Encoders.bean(ScholixResource.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(workingDirPath + "/stepB");
|
||||
|
||||
Dataset<ScholixResource> s2 = spark
|
||||
.read()
|
||||
.load(workingDirPath + "/stepB")
|
||||
.as(Encoders.bean(ScholixResource.class));
|
||||
|
||||
s1
|
||||
.joinWith(
|
||||
s2,
|
||||
s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")),
|
||||
"left")
|
||||
.flatMap(
|
||||
(FlatMapFunction<Tuple2<Scholix, ScholixResource>, Scholix>) f -> {
|
||||
final List<Scholix> res = new ArrayList<>();
|
||||
final Scholix s = f._1();
|
||||
final ScholixResource target = f._2();
|
||||
if (StringUtils.isNotBlank(s.getIdentifier()))
|
||||
res.add(s);
|
||||
else if (target == null) {
|
||||
ScholixResource currentTarget = s.getTarget();
|
||||
currentTarget.setObjectType("unknown");
|
||||
currentTarget
|
||||
.setDnetIdentifier(
|
||||
Datacite2Scholix
|
||||
.generateId(
|
||||
currentTarget.getIdentifier().get(0).getIdentifier(),
|
||||
currentTarget.getIdentifier().get(0).getSchema(),
|
||||
currentTarget.getObjectType()));
|
||||
|
||||
s.generateIdentifier();
|
||||
res.add(s);
|
||||
final Scholix inverse = new Scholix();
|
||||
inverse.setTarget(s.getSource());
|
||||
inverse.setSource(s.getTarget());
|
||||
inverse.setLinkprovider(s.getLinkprovider());
|
||||
inverse.setPublicationDate(s.getPublicationDate());
|
||||
inverse.setPublisher(s.getPublisher());
|
||||
inverse
|
||||
.setRelationship(
|
||||
new ScholixRelationship(
|
||||
s.getRelationship().getInverse(),
|
||||
s.getRelationship().getSchema(),
|
||||
s.getRelationship().getName()));
|
||||
inverse.generateIdentifier();
|
||||
res.add(inverse);
|
||||
|
||||
} else {
|
||||
target
|
||||
.setIdentifier(
|
||||
target
|
||||
.getIdentifier()
|
||||
.stream()
|
||||
.map(
|
||||
d -> new ScholixIdentifier(
|
||||
d.getIdentifier().toLowerCase(),
|
||||
d.getSchema().toLowerCase()))
|
||||
.collect(Collectors.toList()));
|
||||
s.setTarget(target);
|
||||
s.generateIdentifier();
|
||||
res.add(s);
|
||||
final Scholix inverse = new Scholix();
|
||||
inverse.setTarget(s.getSource());
|
||||
inverse.setSource(s.getTarget());
|
||||
inverse.setLinkprovider(s.getLinkprovider());
|
||||
inverse.setPublicationDate(s.getPublicationDate());
|
||||
inverse.setPublisher(s.getPublisher());
|
||||
inverse
|
||||
.setRelationship(
|
||||
new ScholixRelationship(
|
||||
s.getRelationship().getInverse(),
|
||||
s.getRelationship().getSchema(),
|
||||
s.getRelationship().getName()));
|
||||
inverse.generateIdentifier();
|
||||
res.add(inverse);
|
||||
}
|
||||
|
||||
return res.iterator();
|
||||
},
|
||||
Encoders.bean(Scholix.class))
|
||||
.javaRDD()
|
||||
.map(s -> new ObjectMapper().writeValueAsString(s))
|
||||
.saveAsTextFile(workingDirPath + "/resolved_json");
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession getSession(SparkConf conf, String master) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkResolveScholixTarget.class.getSimpleName())
|
||||
.master(master)
|
||||
.getOrCreate();
|
||||
}
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingDirPath",
|
||||
"paramDescription": "the working path where generated files are stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,20 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "nameNode",
|
||||
"paramDescription": "the Name Node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the target path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,45 +0,0 @@
|
||||
|
||||
[
|
||||
{
|
||||
"paramName":"nd",
|
||||
"paramLongName":"newDeposition",
|
||||
"paramDescription": "if it is a new deposition (true) or a new version (false)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName":"cri",
|
||||
"paramLongName":"conceptRecordId",
|
||||
"paramDescription": "The id of the concept record for a new version",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"hdfsp",
|
||||
"paramLongName":"hdfsPath",
|
||||
"paramDescription": "the path of the folder to find files to send to Zenodo",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "nameNode",
|
||||
"paramDescription": "the name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "at",
|
||||
"paramLongName": "accessToken",
|
||||
"paramDescription": "the access token for the deposition",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"cu",
|
||||
"paramLongName":"connectionUrl",
|
||||
"paramDescription": "the url to connect to deposit",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"m",
|
||||
"paramLongName":"metadata",
|
||||
"paramDescription": "metadata associated to the deposition",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
@ -1,4 +0,0 @@
|
||||
{
|
||||
"cluster1": "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54",
|
||||
"cluster2": "10.19.65.55, 10.19.65.56, 10.19.65.57, 10.19.65.58"
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "master should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,14 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "c",
|
||||
"paramLongName": "cluster",
|
||||
"paramDescription": "should be cluster1 or cluster2",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "index",
|
||||
"paramDescription": "index name",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,33 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source path of the generated files to index",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "index",
|
||||
"paramDescription": "the index name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "c",
|
||||
"paramLongName": "cluster",
|
||||
"paramDescription": "the index cluster",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "id",
|
||||
"paramLongName": "idPath",
|
||||
"paramDescription": "the identifier field name",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,20 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingDirPath",
|
||||
"paramDescription": "the working path where generated files are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphPath",
|
||||
"paramDescription": "the graph path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,20 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingDirPath",
|
||||
"paramDescription": "the working path where generated files are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "r",
|
||||
"paramLongName": "relationPath",
|
||||
"paramDescription": "the relation path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,26 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingDirPath",
|
||||
"paramDescription": "the working Dir Path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "h",
|
||||
"paramLongName": "indexHost",
|
||||
"paramDescription": "the host name of the index",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -1,33 +0,0 @@
|
||||
[
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the target path where generated files are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ts",
|
||||
"paramLongName": "timestamp",
|
||||
"paramDescription": "the timestamp for incremental harvesting",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ih",
|
||||
"paramLongName": "indexHost",
|
||||
"paramDescription": "the ip name of the index",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "in",
|
||||
"paramLongName": "indexName",
|
||||
"paramDescription": "the name of the index",
|
||||
"paramRequired": true
|
||||
}
|
||||
|
||||
]
|
@ -1,331 +0,0 @@
|
||||
{
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"identifier": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"linkprovider": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"identifiers": {
|
||||
"properties": {
|
||||
"identifier": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"publicationDate": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"relationship": {
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"source": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"collectedFrom": {
|
||||
"properties": {
|
||||
"completionStatus": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"provider": {
|
||||
"properties": {
|
||||
"identifiers": {
|
||||
"properties": {
|
||||
"identifier": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"provisionMode": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"creator": {
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"dnetIdentifier": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"identifier": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"identifier": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"schema": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"type": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"objectType": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"publicationDate": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"publisher": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"title": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"target": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"collectedFrom": {
|
||||
"properties": {
|
||||
"completionStatus": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"provider": {
|
||||
"properties": {
|
||||
"identifiers": {
|
||||
"properties": {
|
||||
"identifier": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"provisionMode": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"creator": {
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"dnetIdentifier": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"identifier": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"identifier": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"schema": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"type": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"objectType": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"publicationDate": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"publisher": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"title": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"settings": {
|
||||
"index": {
|
||||
"refresh_interval": "600s",
|
||||
"number_of_shards": "48",
|
||||
"translog": {
|
||||
"sync_interval": "15s",
|
||||
"durability": "ASYNC"
|
||||
},
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"analyzer_keyword": {
|
||||
"filter": "lowercase",
|
||||
"tokenizer": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"number_of_replicas": "0"
|
||||
}
|
||||
}
|
||||
}
|
@ -1,132 +0,0 @@
|
||||
{
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"abstract": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"author": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"datasources": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"completionStatus": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"datasourceId": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"datasourceName": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"date": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"id": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"localIdentifier": {
|
||||
"type": "nested",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"type": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"publisher": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"relatedDatasets": {
|
||||
"type": "long"
|
||||
},
|
||||
"relatedPublications": {
|
||||
"type": "long"
|
||||
},
|
||||
"relatedUnknown": {
|
||||
"type": "long"
|
||||
},
|
||||
"subject": {
|
||||
"properties": {
|
||||
"scheme": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"value": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"title": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"typology": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"settings": {
|
||||
"index": {
|
||||
"refresh_interval": "600s",
|
||||
"number_of_shards": "48",
|
||||
"translog": {
|
||||
"sync_interval": "15s",
|
||||
"durability": "ASYNC"
|
||||
},
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"analyzer_keyword": {
|
||||
"filter": "lowercase",
|
||||
"tokenizer": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"number_of_replicas": "0"
|
||||
}
|
||||
}
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.rerun.failnodes</name>
|
||||
<value>false</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
</configuration>
|
@ -1,49 +0,0 @@
|
||||
<workflow-app name="Export Scholexplorer Graph to OpenAIRE" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingDirPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores for individual executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ExtractOAF"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ExtractOAF">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractOAF</name>
|
||||
<class>eu.dnetlib.dhp.export.SparkExportContentForOpenAire</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,14 +0,0 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -1,86 +0,0 @@
|
||||
<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingDirPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>index</name>
|
||||
<description>the index name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esCluster</name>
|
||||
<description>the Index cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="DropAndCreateIndex"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DropAndCreateIndex">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.provision.DropAndCreateESIndex</main-class>
|
||||
<arg>-i</arg><arg>${index}</arg>
|
||||
<arg>-c</arg><arg>${esCluster}</arg>
|
||||
</java>
|
||||
<ok to="indexSummary"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="indexSummary">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>index summary</name>
|
||||
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingDirPath}/summary_json</arg>
|
||||
<arg>--index</arg><arg>${index}_object</arg>
|
||||
<arg>--idPath</arg><arg>id</arg>
|
||||
<arg>--cluster</arg><arg>${esCluster}</arg>
|
||||
</spark>
|
||||
<ok to="indexScholix"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="indexScholix">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>index scholix</name>
|
||||
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
|
||||
<arg>--index</arg><arg>${index}_scholix</arg>
|
||||
<arg>--idPath</arg><arg>identifier</arg>
|
||||
<arg>--cluster</arg><arg>${esCluster}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,14 +0,0 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -1,116 +0,0 @@
|
||||
<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingDirPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graphPath</name>
|
||||
<description>the graph path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>index</name>
|
||||
<description>the index name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esCluster</name>
|
||||
<description>the Index cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="DeleteTargetPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="DeleteTargetPath">
|
||||
<fs>
|
||||
<delete path='${workingDirPath}'/>
|
||||
<mkdir path='${workingDirPath}'/>
|
||||
</fs>
|
||||
<ok to="CalculateRelatedItem"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CalculateRelatedItem">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>calculate for each ID the number of related Dataset, publication and Unknown</name>
|
||||
<class>eu.dnetlib.dhp.provision.SparkExtractRelationCount</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||
<arg>--relationPath</arg><arg>${graphPath}/relation</arg>
|
||||
</spark>
|
||||
<ok to="generateSummary"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="generateSummary">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>generate Summary</name>
|
||||
<class>eu.dnetlib.dhp.provision.SparkGenerateSummaryIndex</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT}</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
||||
</spark>
|
||||
<ok to="generateScholix"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="generateScholix">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>generate Scholix</name>
|
||||
<class>eu.dnetlib.dhp.provision.SparkGenerateScholixIndex</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT}</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
||||
</spark>
|
||||
<ok to="datasetToJson"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="datasetToJson">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>generate Scholix</name>
|
||||
<class>eu.dnetlib.dhp.provision.SparkConvertDatasetToJson</class>
|
||||
<jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.sql.shuffle.partitions=4000 ${sparkExtraOPT}</spark-opts>
|
||||
<arg>-m</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDirPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,14 +0,0 @@
|
||||
<!-- Oozie job configuration: three switches, each given as a name/value property. -->
<configuration>

    <!-- enable the Oozie system libpath -->
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>

    <!-- sharelib selected for Spark actions -->
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>

    <!-- launcher classpath ordering flag -->
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>

</configuration>
|
@ -1,97 +0,0 @@
|
||||
<!-- Incremental Datacite synchronisation workflow.
     Step order: ResetWorkingPath, ImportDataciteUpdate, resolveScholix, indexScholix, End.
     Any failing step transitions to Kill.
     Besides the parameters declared below, the actions also reference jobTracker, nameNode,
     projectVersion and sparkExtraOPT, which must be supplied by the job configuration. -->
<workflow-app name="Keep On Synch datacite" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>workingDirPath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>index</name>
            <description>index name</description>
        </property>
        <property>
            <name>timestamp</name>
            <description>timestamp from incremental harvesting</description>
        </property>
        <!-- Previously this host was hard-coded in two actions. It is now a parameter whose
             default is the old literal, so existing submissions keep working unchanged. -->
        <property>
            <name>indexHost</name>
            <value>ip-90-147-167-25.ct1.garrservices.it</value>
            <description>host passed to the import step (-ih) and to the resolve step (-h)</description>
        </property>
    </parameters>

    <start to="ResetWorkingPath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <!-- Start from an empty synch folder: delete it, then create it again. -->
    <action name="ResetWorkingPath">
        <fs>
            <delete path='${workingDirPath}/synch'/>
            <mkdir path='${workingDirPath}/synch'/>
        </fs>
        <ok to="ImportDataciteUpdate"/>
        <error to="Kill"/>
    </action>

    <!-- Java step running RetrieveUpdateFromDatacite; its -t argument points at synch/input_json,
         which is the same path the next step receives as -s. -->
    <action name="ImportDataciteUpdate">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.dhp.provision.update.RetrieveUpdateFromDatacite</main-class>
            <arg>-t</arg><arg>${workingDirPath}/synch/input_json</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-ts</arg><arg>${timestamp}</arg>
            <arg>-ih</arg><arg>${indexHost}</arg>
            <arg>-in</arg><arg>datacite</arg>
        </java>
        <ok to="resolveScholix"/>
        <error to="Kill"/>
    </action>

    <!-- Spark step running SparkResolveScholixTarget with synch/input_json as -s and synch as -w. -->
    <action name="resolveScholix">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>resolve and generate Scholix</name>
            <class>eu.dnetlib.dhp.provision.update.SparkResolveScholixTarget</class>
            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
            <arg>-m</arg> <arg>yarn-cluster</arg>
            <arg>-s</arg><arg>${workingDirPath}/synch/input_json</arg>
            <arg>-w</arg><arg>${workingDirPath}/synch</arg>
            <arg>-h</arg><arg>${indexHost}</arg>
        </spark>
        <ok to="indexScholix"/>
        <error to="Kill"/>
    </action>

    <!-- Spark step running SparkIndexCollectionOnES on synch/resolved_json, targeting the
         index named after the "index" parameter with the _scholix suffix. -->
    <action name="indexScholix">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>index scholix</name>
            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--sourcePath</arg><arg>${workingDirPath}/synch/resolved_json</arg>
            <arg>--index</arg><arg>${index}_scholix</arg>
            <arg>--idPath</arg><arg>identifier</arg>
            <arg>--type</arg><arg>scholix</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
|
@ -1,48 +0,0 @@
|
||||
<!-- Default job properties for the workflows of this module, grouped by concern. -->
<configuration>

    <!-- cluster endpoints -->
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>

    <!-- Oozie library and rerun switches -->
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>oozie.wf.rerun.failnodes</name>
        <value>false</value>
    </property>

    <!-- Hive metastore -->
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>

    <!-- Spark 2 history server, event log location and listeners
         (the listener values include their surrounding double quotes) -->
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
    </property>

    <!-- launcher classpath ordering flag -->
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>

</configuration>
|
@ -1,53 +0,0 @@
|
||||
<!-- Workflow that packs a dump into a tar archive (MakeTar). The upload step (send_zenodo)
     is disabled and only described in a comment further down.
     The MakeTar action also references jobTracker and nameNode, which must be supplied by
     the job configuration. -->
<workflow-app name="Send Dump to Zenodo" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the target path</description>
        </property>
        <!-- Disabled parameter, only used by the disabled send_zenodo step:
             name "metadata", description "the metadata". -->
    </parameters>

    <!-- The start node used to point at send_zenodo, which is not a defined action (it only
         exists as a comment), so the transition was invalid. MakeTar is the only enabled
         action and is therefore the entry point. -->
    <start to="MakeTar"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <!-- Java step running MakeTar with the target path (-t), the name node (-n) and the
         source path (-s). -->
    <action name="MakeTar">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.dhp.export.zenodo.MakeTar</main-class>
            <arg>-t</arg><arg>${targetPath}</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-s</arg><arg>${sourcePath}</arg>
        </java>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <!-- Disabled step "send_zenodo", kept for reference as a description instead of
         commented-out markup: its arguments start with a double hyphen, and that character
         sequence is not allowed inside an XML comment. Each option below takes a leading
         double hyphen when the step is restored.
           type            java action
           main-class      eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS
           hdfsPath        /user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar
           nameNode        ${nameNode}
           accessToken     REDACTED (a literal token was embedded here; pass it as a workflow
                           parameter and revoke the old one)
           connectionUrl   https://zenodo.org/api/deposit/depositions
           metadata        ${metadata}
           conceptRecordId 1200252
           newDeposition   false
           transitions     ok to End, error to Kill
    -->

    <end name="End"/>
</workflow-app>
|
@ -1,102 +0,0 @@
|
||||
package eu.dnetlib.dhp.export
|
||||
|
||||
import java.time.LocalDateTime
|
||||
import java.time.format.DateTimeFormatter
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import scala.io.Source
|
||||
import scala.collection.JavaConverters._
|
||||
/**
 * Print-only tests for the DLI to OAF conversions: each test feeds a value or a JSON
 * fixture through a `DLIToOAF` conversion (or plain Jackson parsing) and prints the outcome.
 * None of the tests makes an assertion.
 */
class ExportDLITOOAFTest {

  val mapper = new ObjectMapper()

  /** Reads a classpath resource, resolved relative to this class, into one string. */
  private def resourceAsString(resourceName: String): String = {
    val stream = getClass.getResourceAsStream(resourceName)
    Source.fromInputStream(stream).mkString
  }

  /** Requests the INDENT_OUTPUT feature on the mapper's serialization config. */
  private def enableIndentedOutput(): Unit = {
    mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
  }

  /** Prints the current local date-time using a pattern that ends with a literal 'Z'. */
  @Test
  def testDate(): Unit = {
    val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")
    val formatted = LocalDateTime.now().format(formatter)
    println(formatted)
  }

  /** Names of the providers listed in the collectedFrom entries of the Scholix target. */
  def extractDatasources(s: Scholix): List[String] = {
    val collectedFrom = s.getTarget.getCollectedFrom.asScala
    collectedFrom.map(_.getProvider.getName).toList
  }

  /** Names of the datasources listed in the summary. */
  def extractDatasources(s: ScholixSummary): List[String] = {
    val datasources = s.getDatasources.asScala
    datasources.map(_.getDatasourceName).toList
  }

  /** Converts a hand-built relation and prints the converted source and target. */
  @Test
  def testMappingRele(): Unit = {
    val r: Relation = new Relation
    r.setSource("60|fbff1d424e045eecf24151a5fe3aa738")
    r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877")
    r.setRelType("IsReferencedBy")

    val r1 = DLIToOAF.convertDLIRelation(r)
    println(r1.getSource, r1.getTarget)
  }

  /** Parses publication.json as a DLIPublication, converts it and prints it as JSON. */
  @Test
  def testPublicationMapping(): Unit = {
    enableIndentedOutput()
    val json = resourceAsString("publication.json")
    val publication = mapper.readValue(json, classOf[DLIPublication])
    val oaf = DLIToOAF.convertDLIPublicationToOAF(publication)
    println(mapper.writeValueAsString(oaf))
  }

  /** Parses dataset.json as a DLIDataset, converts it to an external reference and prints it. */
  @Test
  def testExternalReferenceMapping(): Unit = {
    enableIndentedOutput()
    val json = resourceAsString("dataset.json")
    val dataset = mapper.readValue(json, classOf[DLIDataset])
    val oaf = DLIToOAF.convertDLIDatasetToExternalReference(dataset)
    println(oaf)
  }

  /** Parses relation.json as a Relation and prints it back as JSON. */
  @Test
  def testRelationMapping(): Unit = {
    enableIndentedOutput()
    val json = resourceAsString("relation.json")
    val oaf = mapper.readValue(json, classOf[Relation])
    println(mapper.writeValueAsString(oaf))
  }

}
|
@ -1,50 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix;
|
||||
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
|
||||
import eu.dnetlib.dhp.provision.update.*;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public class DataciteClientTest {
|
||||
@Test
|
||||
public void dataciteSCholixTest() throws Exception {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json"));
|
||||
final RelationMapper mapper = RelationMapper.load();
|
||||
|
||||
Datacite2Scholix ds = new Datacite2Scholix(mapper);
|
||||
final List<Scholix> s = ds.generateScholixFromJson(json);
|
||||
System.out.println(new ObjectMapper().writeValueAsString(s));
|
||||
}
|
||||
|
||||
// public void testS() throws Exception {
|
||||
// RetrieveUpdateFromDatacite.main(new String[]{
|
||||
// "-n", "file:///data/new_s2.txt",
|
||||
// "-t", "/data/new_s2.txt",
|
||||
// "-ts", "1586974078",
|
||||
// "-ih", "ip-90-147-167-25.ct1.garrservices.it",
|
||||
// "-in", "datacite",
|
||||
// });
|
||||
//
|
||||
// }
|
||||
|
||||
public void testResolveDataset() throws Exception {
|
||||
DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it");
|
||||
ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5");
|
||||
Assertions.assertNotNull(datasetByDOI);
|
||||
System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI));
|
||||
|
||||
CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it");
|
||||
ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46");
|
||||
Assertions.assertNotNull(crossrefByDOI);
|
||||
System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI));
|
||||
}
|
||||
}
|
@ -1,13 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class DropAndCreateESIndexTest {
|
||||
|
||||
public void testDropAndCreate() throws Exception {
|
||||
DropAndCreateESIndex.main("-c localhost -i dli_shadow".split(" "));
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.provision;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix;
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||
|
||||
public class ExtractInfoTest {
|
||||
@Test
|
||||
public void testSerialization() throws Exception {
|
||||
|
||||
ScholixSummary summary = new ScholixSummary();
|
||||
summary.setDescription("descrizione");
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
String json = mapper.writeValueAsString(summary);
|
||||
System.out.println(json);
|
||||
System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testScholix() throws Exception {
|
||||
final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json"));
|
||||
final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json"));
|
||||
Scholix.generateScholixWithSource(jsonSummary, jsonRelation);
|
||||
}
|
||||
}
|
@ -1,101 +0,0 @@
|
||||
{
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": null,
|
||||
"deletedbyinference": false,
|
||||
"trust": "0.9",
|
||||
"inferenceprovenance": null,
|
||||
"provenanceaction": null
|
||||
},
|
||||
"lastupdatetimestamp": null,
|
||||
"id": "60|719f19e5a996de1b87cddf93871bf2d4",
|
||||
"originalId": [
|
||||
"a0a3p2gws9::uniprot"
|
||||
],
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key": "dli_________::europe_pmc__",
|
||||
"value": "Europe PMC",
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"pid": [
|
||||
{
|
||||
"value": "acc63471",
|
||||
"qualifier": {
|
||||
"classid": "ena",
|
||||
"classname": "ena",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"dateofcollection": "2019-07-05T12:47:11.545+02:00",
|
||||
"dateoftransformation": null,
|
||||
"extraInfo": null,
|
||||
"oaiprovenance": null,
|
||||
"author": null,
|
||||
"resulttype": {
|
||||
"classid": "dataset",
|
||||
"classname": "dataset",
|
||||
"schemeid": "dataset",
|
||||
"schemename": "dataset"
|
||||
},
|
||||
"language": null,
|
||||
"country": null,
|
||||
"subject": [],
|
||||
"title": [
|
||||
{
|
||||
"value": "CMD domain-containing protein",
|
||||
"qualifier": null,
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"relevantdate": [
|
||||
{
|
||||
"value": "2019-07-15T16:14:28.636",
|
||||
"qualifier": {
|
||||
"classid": "resolvedDate",
|
||||
"classname": "resolvedDate",
|
||||
"schemeid": "dnet::date",
|
||||
"schemename": "dnet::date"
|
||||
},
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"description": null,
|
||||
"dateofacceptance": null,
|
||||
"publisher": {
|
||||
"value": "UniProt",
|
||||
"dataInfo": null
|
||||
},
|
||||
"embargoenddate": null,
|
||||
"source": null,
|
||||
"fulltext": null,
|
||||
"format": null,
|
||||
"contributor": null,
|
||||
"resourcetype": null,
|
||||
"coverage": null,
|
||||
"bestaccessright": null,
|
||||
"context": null,
|
||||
"externalReference": null,
|
||||
"instance": [],
|
||||
"storagedate": null,
|
||||
"device": null,
|
||||
"size": null,
|
||||
"version": null,
|
||||
"lastmetadataupdate": null,
|
||||
"metadataversionnumber": null,
|
||||
"geolocation": null,
|
||||
"originalObjIdentifier": "europe_pmc__::719f19e5a996de1b87cddf93871bf2d4",
|
||||
"dlicollectedfrom": [
|
||||
{
|
||||
"id": "dli_________::europe_pmc__",
|
||||
"name": "Europe PMC",
|
||||
"completionStatus": "complete",
|
||||
"collectionMode": null
|
||||
}
|
||||
],
|
||||
"completionStatus": "complete"
|
||||
}
|
@ -1,128 +0,0 @@
|
||||
{
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": null,
|
||||
"deletedbyinference": false,
|
||||
"trust": "0.9",
|
||||
"inferenceprovenance": null,
|
||||
"provenanceaction": null
|
||||
},
|
||||
"lastupdatetimestamp": null,
|
||||
"id": "50|9e117414be07bf03cbce8889d22d661a",
|
||||
"originalId": [
|
||||
"9e117414be07bf03cbce8889d22d661a"
|
||||
],
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key": "dli_________::crossref",
|
||||
"value": "Crossref",
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"pid": [
|
||||
{
|
||||
"value": "10.1007/978-94-017-3490-5_15",
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "doi",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"dateofcollection": "2020-06-08T07:28:55.731Z",
|
||||
"dateoftransformation": null,
|
||||
"extraInfo": null,
|
||||
"oaiprovenance": null,
|
||||
"author": [
|
||||
{
|
||||
"fullname": "Calcaterra Domenico",
|
||||
"name": null,
|
||||
"surname": null,
|
||||
"rank": null,
|
||||
"pid": null,
|
||||
"affiliation": null
|
||||
},
|
||||
{
|
||||
"fullname": "Parise Mario",
|
||||
"name": null,
|
||||
"surname": null,
|
||||
"rank": null,
|
||||
"pid": null,
|
||||
"affiliation": null
|
||||
}
|
||||
],
|
||||
"resulttype": {
|
||||
"classid": "publication",
|
||||
"classname": "publication",
|
||||
"schemeid": "publication",
|
||||
"schemename": "publication"
|
||||
},
|
||||
"language": null,
|
||||
"country": null,
|
||||
"subject":[
|
||||
{
|
||||
"value":"Strain-linked information about bacterial and archaeal biodiversity",
|
||||
"qualifier":{
|
||||
"classid":"dnet:subject",
|
||||
"classname":"dnet:subject",
|
||||
"schemeid":"",
|
||||
"schemename":""
|
||||
},
|
||||
"dataInfo":null
|
||||
}
|
||||
],
|
||||
"title": [
|
||||
{
|
||||
"value": "The Contribution of Historical Information in the Assessment of Landslide Hazard",
|
||||
"qualifier": null,
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"relevantdate": [
|
||||
{
|
||||
"value": "2013-01-29T16:50:44Z",
|
||||
"qualifier": {
|
||||
"classid": "date",
|
||||
"classname": "date",
|
||||
"schemeid": "dnet::date",
|
||||
"schemename": "dnet::date"
|
||||
},
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"description": [
|
||||
{
|
||||
"value": null,
|
||||
"dataInfo": null
|
||||
}
|
||||
],
|
||||
"dateofacceptance": null,
|
||||
"publisher": {
|
||||
"value": "Springer Netherlands",
|
||||
"dataInfo": null
|
||||
},
|
||||
"embargoenddate": null,
|
||||
"source": null,
|
||||
"fulltext": null,
|
||||
"format": null,
|
||||
"contributor": null,
|
||||
"resourcetype": null,
|
||||
"coverage": null,
|
||||
"bestaccessright": null,
|
||||
"context": null,
|
||||
"externalReference": null,
|
||||
"instance": [],
|
||||
"journal": null,
|
||||
"originalObjIdentifier": "dli_resolver::9e117414be07bf03cbce8889d22d661a",
|
||||
"dlicollectedfrom": [
|
||||
{
|
||||
"id": "dli_________::crossref",
|
||||
"name": "Crossref",
|
||||
"completionStatus": "complete",
|
||||
"collectionMode": "resolved"
|
||||
}
|
||||
],
|
||||
"completionStatus": "complete"
|
||||
}
|
@ -1,23 +0,0 @@
|
||||
{
|
||||
"subRelType": null,
|
||||
"relClass": "datacite",
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"provenanceaction": null,
|
||||
"inferred": null,
|
||||
"inferenceprovenance": null,
|
||||
"invisible": false,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"target": "50|00062410e2a15322480277d063c181bb",
|
||||
"lastupdatetimestamp": null,
|
||||
"relType": "IsReferencedBy",
|
||||
"source": "60|4ee78ab329b49416b45c3774c132f244",
|
||||
"collectedfrom": [
|
||||
{
|
||||
"dataInfo": null,
|
||||
"value": "Europe PMC",
|
||||
"key": "dli_________::europe_pmc__"
|
||||
}
|
||||
]
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1 +0,0 @@
|
||||
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"references","subRelType":null,"relClass":"datacite","source":"50|f2123fce7e56c73dc8f1bf64ec59b477","target":"50|b618cbe39ba940a29993ac324e5f9621","collectedFrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}]}
|
@ -1 +0,0 @@
|
||||
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"IsReferencedBy","subRelType":null,"relClass":"datacite","source":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","target":"60|97519e00ee2cddfa1f5bcb5220429b8f","collectedfrom":[{"key":"dli_________::europe_pmc__","value":"Europe PMC","dataInfo":null}]}
|
Loading…
Reference in New Issue