forked from D-Net/dnet-hadoop
Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta
This commit is contained in:
commit
33bb79459e
|
@ -57,6 +57,11 @@
|
||||||
<artifactId>commons-io</artifactId>
|
<artifactId>commons-io</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
|
|
@ -10,10 +10,16 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
|
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.validator.routines.UrlValidator;
|
||||||
import org.dom4j.*;
|
import org.dom4j.*;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
@ -50,6 +56,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected static final Map<String, String> nsContext = new HashMap<>();
|
protected static final Map<String, String> nsContext = new HashMap<>();
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class);
|
||||||
|
|
||||||
static {
|
static {
|
||||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||||
|
@ -76,40 +84,44 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
this.forceOriginalId = false;
|
this.forceOriginalId = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Oaf> processMdRecord(final String xml) throws DocumentException {
|
public List<Oaf> processMdRecord(final String xml) {
|
||||||
|
|
||||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||||
|
try {
|
||||||
|
final Document doc = DocumentHelper
|
||||||
|
.parseText(
|
||||||
|
xml
|
||||||
|
.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)
|
||||||
|
.replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3)
|
||||||
|
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3));
|
||||||
|
|
||||||
final Document doc = DocumentHelper
|
final KeyValue collectedFrom = getProvenanceDatasource(
|
||||||
.parseText(
|
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
|
||||||
xml
|
|
||||||
.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)
|
|
||||||
.replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3)
|
|
||||||
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3));
|
|
||||||
|
|
||||||
final KeyValue collectedFrom = getProvenanceDatasource(
|
if (collectedFrom == null) {
|
||||||
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
|
return Lists.newArrayList();
|
||||||
|
}
|
||||||
|
|
||||||
if (collectedFrom == null) {
|
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
|
||||||
|
? collectedFrom
|
||||||
|
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
|
||||||
|
|
||||||
|
if (hostedBy == null) {
|
||||||
|
return Lists.newArrayList();
|
||||||
|
}
|
||||||
|
|
||||||
|
final DataInfo info = prepareDataInfo(doc, invisible);
|
||||||
|
final long lastUpdateTimestamp = new Date().getTime();
|
||||||
|
|
||||||
|
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
|
||||||
|
|
||||||
|
final String type = getResultType(doc, instances);
|
||||||
|
|
||||||
|
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
} catch (DocumentException e) {
|
||||||
|
log.error("Error with record:\n" + xml);
|
||||||
return Lists.newArrayList();
|
return Lists.newArrayList();
|
||||||
}
|
}
|
||||||
|
|
||||||
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
|
|
||||||
? collectedFrom
|
|
||||||
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
|
|
||||||
|
|
||||||
if (hostedBy == null) {
|
|
||||||
return Lists.newArrayList();
|
|
||||||
}
|
|
||||||
|
|
||||||
final DataInfo info = prepareDataInfo(doc, invisible);
|
|
||||||
final long lastUpdateTimestamp = new Date().getTime();
|
|
||||||
|
|
||||||
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
|
|
||||||
|
|
||||||
final String type = getResultType(doc, instances);
|
|
||||||
|
|
||||||
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getResultType(final Document doc, final List<Instance> instances) {
|
protected String getResultType(final Document doc, final List<Instance> instances) {
|
||||||
|
@ -609,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected Set<String> validateUrl(Collection<String> url) {
|
||||||
|
UrlValidator urlValidator = UrlValidator.getInstance();
|
||||||
|
if (Objects.isNull(url)) {
|
||||||
|
return new HashSet<>();
|
||||||
|
}
|
||||||
|
return url
|
||||||
|
.stream()
|
||||||
|
.filter(u -> urlValidator.isValid(u))
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,9 @@ import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.rdd.RDD;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -127,8 +130,8 @@ public class GenerateEntitiesApplication {
|
||||||
.sequenceFile(sp, Text.class, Text.class)
|
.sequenceFile(sp, Text.class, Text.class)
|
||||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||||
.map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
|
.map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
|
||||||
.filter(Objects::nonNull)
|
.flatMap(List::iterator)
|
||||||
.flatMap(List::iterator));
|
.filter(Objects::nonNull));
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
|
@ -155,11 +158,11 @@ public class GenerateEntitiesApplication {
|
||||||
.saveAsTextFile(targetPath, GzipCodec.class);
|
.saveAsTextFile(targetPath, GzipCodec.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<Oaf> convertToListOaf(
|
public static List<Oaf> convertToListOaf(
|
||||||
final String id,
|
final String id,
|
||||||
final String s,
|
final String s,
|
||||||
final boolean shouldHashId,
|
final boolean shouldHashId,
|
||||||
final VocabularyGroup vocs) throws DocumentException {
|
final VocabularyGroup vocs) {
|
||||||
final String type = StringUtils.substringAfter(id, ":");
|
final String type = StringUtils.substringAfter(id, ":");
|
||||||
|
|
||||||
switch (type.toLowerCase()) {
|
switch (type.toLowerCase()) {
|
||||||
|
@ -200,8 +203,7 @@ public class GenerateEntitiesApplication {
|
||||||
try {
|
try {
|
||||||
return OBJECT_MAPPER.readValue(s, clazz);
|
return OBJECT_MAPPER.readValue(s, clazz);
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
log.error("Error parsing object of class: {}", clazz);
|
log.error("Error parsing object of class: {}:\n{}", clazz, s);
|
||||||
log.error(s);
|
|
||||||
throw new IllegalArgumentException(e);
|
throw new IllegalArgumentException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
|
|
||||||
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||||
instance
|
final List<String> url = nodes
|
||||||
.setUrl(
|
.stream()
|
||||||
nodes
|
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
||||||
.stream()
|
.map(n -> n.getText().trim())
|
||||||
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
.filter(u -> u.startsWith("http"))
|
||||||
.map(n -> n.getText().trim())
|
.map(s -> {
|
||||||
.filter(u -> u.startsWith("http"))
|
try {
|
||||||
.map(s -> {
|
return URLDecoder.decode(s, "UTF-8");
|
||||||
try {
|
} catch (Throwable t) {
|
||||||
return URLDecoder.decode(s, "UTF-8");
|
return s;
|
||||||
} catch (Throwable t) {
|
}
|
||||||
return s;
|
})
|
||||||
}
|
.distinct()
|
||||||
})
|
.collect(Collectors.toCollection(ArrayList::new));
|
||||||
.distinct()
|
final Set<String> validUrl = validateUrl(url);
|
||||||
.collect(Collectors.toCollection(ArrayList::new)));
|
if (!validUrl.isEmpty()) {
|
||||||
|
instance.setUrl(new ArrayList<>());
|
||||||
|
instance.getUrl().addAll(validUrl);
|
||||||
|
}
|
||||||
|
|
||||||
return Lists.newArrayList(instance);
|
return Lists.newArrayList(instance);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||||
|
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.net.URLDecoder;
|
import java.net.URLDecoder;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.validator.routines.UrlValidator;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Element;
|
import org.dom4j.Element;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
||||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||||
}
|
}
|
||||||
for (final Object o : doc
|
|
||||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
Set<String> validUrl = validateUrl(url);
|
||||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
|
||||||
|
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
|
||||||
|
for (final Object o : doc
|
||||||
|
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
||||||
|
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
|
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
||||||
|
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
|
||||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
for (final Object o : doc
|
||||||
|
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
||||||
|
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
|
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
||||||
|
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (final Object o : doc
|
|
||||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
if (!validUrl.isEmpty()) {
|
||||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
|
||||||
}
|
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
|
||||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
|
||||||
}
|
|
||||||
if (!url.isEmpty()) {
|
|
||||||
instance.setUrl(new ArrayList<>());
|
instance.setUrl(new ArrayList<>());
|
||||||
instance.getUrl().addAll(url);
|
instance.getUrl().addAll(validUrl);
|
||||||
}
|
}
|
||||||
return Arrays.asList(instance);
|
return Arrays.asList(instance);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.rdd.RDD;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class VerifyRecordsApplication {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(VerifyRecordsApplication.class);
|
||||||
|
|
||||||
|
public static void main(final String[] args) throws Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
VerifyRecordsApplication.class
|
||||||
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json")));
|
||||||
|
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
final Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
final String sourcePaths = parser.get("sourcePaths");
|
||||||
|
log.info("sourcePaths: {}", sourcePaths);
|
||||||
|
|
||||||
|
final String invalidPath = parser.get("invalidPath");
|
||||||
|
log.info("invalidPath: {}", invalidPath);
|
||||||
|
|
||||||
|
final String isLookupUrl = parser.get("isLookupUrl");
|
||||||
|
log.info("isLookupUrl: {}", isLookupUrl);
|
||||||
|
|
||||||
|
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||||
|
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||||
|
|
||||||
|
final SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
|
HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration());
|
||||||
|
validateRecords(spark, sourcePaths, invalidPath, vocs);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void validateRecords(SparkSession spark, String sourcePaths, String invalidPath,
|
||||||
|
VocabularyGroup vocs) {
|
||||||
|
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
final List<String> existingSourcePaths = Arrays
|
||||||
|
.stream(sourcePaths.split(","))
|
||||||
|
.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
log.info("Verify records in files:");
|
||||||
|
existingSourcePaths.forEach(log::info);
|
||||||
|
|
||||||
|
for (final String sp : existingSourcePaths) {
|
||||||
|
RDD<String> invalidRecords = sc
|
||||||
|
.sequenceFile(sp, Text.class, Text.class)
|
||||||
|
.map(k -> tryApplyMapping(k._1().toString(), k._2().toString(), true, vocs))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.rdd();
|
||||||
|
spark
|
||||||
|
.createDataset(invalidRecords, Encoders.STRING())
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Append)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.text(invalidPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String tryApplyMapping(
|
||||||
|
final String id,
|
||||||
|
final String xmlRecord,
|
||||||
|
final boolean shouldHashId,
|
||||||
|
final VocabularyGroup vocs) {
|
||||||
|
|
||||||
|
final List<Oaf> oaf = GenerateEntitiesApplication.convertToListOaf(id, xmlRecord, shouldHashId, vocs);
|
||||||
|
if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) {
|
||||||
|
return xmlRecord;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -24,8 +25,11 @@ import org.apache.http.impl.client.HttpClients;
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.OafToOafMapper;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.OdfToOafMapper;
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class AbstractMigrationApplication implements Closeable {
|
public class AbstractMigrationApplication implements Closeable {
|
||||||
|
|
|
@ -446,10 +446,34 @@
|
||||||
<join name="wait_import" to="fork_generate_entities"/>
|
<join name="wait_import" to="fork_generate_entities"/>
|
||||||
|
|
||||||
<fork name="fork_generate_entities">
|
<fork name="fork_generate_entities">
|
||||||
<path start="GenerateEntities_claim"/>
|
<path start="VerifyRecords_claim"/>
|
||||||
<path start="GenerateEntities"/>
|
<path start="VerifyRecords"/>
|
||||||
</fork>
|
</fork>
|
||||||
|
|
||||||
|
<action name="VerifyRecords_claim">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>VerifyRecords_claim</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory ${sparkExecutorMemory}
|
||||||
|
--executor-cores ${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
|
||||||
|
<arg>--invalidPath</arg><arg>${workingDir}/invalid_records_claim</arg>
|
||||||
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="GenerateEntities_claim"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
<action name="GenerateEntities_claim">
|
<action name="GenerateEntities_claim">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -499,6 +523,30 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<action name="VerifyRecords">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>VerifyRecords</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory ${sparkExecutorMemory}
|
||||||
|
--executor-cores ${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePaths</arg><arg>${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible</arg>
|
||||||
|
<arg>--invalidPath</arg><arg>${workingDir}/invalid_records</arg>
|
||||||
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="GenerateEntities"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
<action name="GenerateEntities">
|
<action name="GenerateEntities">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "issm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "s",
|
||||||
|
"paramLongName": "sourcePaths",
|
||||||
|
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "i",
|
||||||
|
"paramLongName": "invalidPath",
|
||||||
|
"paramDescription": "the path of the invalid records file",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "isu",
|
||||||
|
"paramLongName": "isLookupUrl",
|
||||||
|
"paramDescription": "the url of the ISLookupService",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -21,7 +21,6 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
@ -948,6 +947,15 @@ class MappersTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNotWellFormed() throws IOException {
|
||||||
|
final String xml = IOUtils
|
||||||
|
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
||||||
|
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
assertNotNull(actual);
|
||||||
|
assertTrue(actual.isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
private void assertValidId(final String id) {
|
private void assertValidId(final String id) {
|
||||||
// System.out.println(id);
|
// System.out.println(id);
|
||||||
|
|
||||||
|
|
|
@ -251,6 +251,18 @@ class MigrateDbEntitiesApplicationTest {
|
||||||
assertValidId(r2.getSource());
|
assertValidId(r2.getSource());
|
||||||
assertEquals(r1.getSource(), r2.getTarget());
|
assertEquals(r1.getSource(), r2.getTarget());
|
||||||
assertEquals(r2.getSource(), r1.getTarget());
|
assertEquals(r2.getSource(), r1.getTarget());
|
||||||
|
|
||||||
|
assertTrue(r1.getSource().startsWith("10|"));
|
||||||
|
assertTrue(r1.getTarget().startsWith("20|"));
|
||||||
|
|
||||||
|
assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r1.getRelType());
|
||||||
|
assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r2.getRelType());
|
||||||
|
|
||||||
|
assertEquals(ModelConstants.PROVISION, r1.getSubRelType());
|
||||||
|
assertEquals(ModelConstants.PROVISION, r2.getSubRelType());
|
||||||
|
|
||||||
|
assertEquals(ModelConstants.IS_PROVIDED_BY, r1.getRelClass());
|
||||||
|
assertEquals(ModelConstants.PROVIDES, r2.getRelClass());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||||
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<header xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection/>
|
||||||
|
<dri:mdFormat/>
|
||||||
|
<dri:mdFormatInterpretation/>
|
||||||
|
<dri:repositoryId/>
|
||||||
|
<dr:objectIdentifier/>
|
||||||
|
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
|
||||||
|
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
|
||||||
|
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
|
||||||
|
</header>
|
||||||
|
<metadata xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dc:title>多項式GCDを用いた復号法に関する研究<dc:title>
|
||||||
|
<dc:creator>上原, 剛</dc:creator>
|
||||||
|
<dc:creator>甲斐, 博</dc:creator>
|
||||||
|
<dc:creator>野田, 松太郎</dc:creator>
|
||||||
|
<dc:format>application/pdf</dc:format>
|
||||||
|
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
|
||||||
|
<dc:language>jpn</dc:language>
|
||||||
|
<dc:publisher>京都大学数理解析研究所</dc:publisher>
|
||||||
|
<dc:subject classid="ndc" classname="ndc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
|
||||||
|
<dc:type>Departmental Bulletin Paper</dc:type>
|
||||||
|
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
|
||||||
|
<oaf:projectid/>
|
||||||
|
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
|
||||||
|
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
|
||||||
|
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
|
||||||
|
</metadata>
|
||||||
|
<about>
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
|
||||||
|
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
|
||||||
|
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
|
||||||
|
<datestamp>2021-04-13T13:36:29Z</datestamp>
|
||||||
|
<metadataNamespace/>
|
||||||
|
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
|
||||||
|
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
|
||||||
|
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
|
||||||
|
<datestamp>2012-07-12T14:15:41Z</datestamp>
|
||||||
|
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
|
||||||
|
</originDescription>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||||
|
classname="sysimport:crosswalk:repository"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
|
@ -0,0 +1,8 @@
|
||||||
|
# Root logger option
|
||||||
|
log4j.rootLogger=DEBUG, stdout
|
||||||
|
|
||||||
|
# Direct log messages to stdout
|
||||||
|
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
|
||||||
|
log4j.appender.stdout.Target=System.out
|
||||||
|
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
|
||||||
|
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
|
6
pom.xml
6
pom.xml
|
@ -200,6 +200,12 @@
|
||||||
<version>${dhp.commons.lang.version}</version>
|
<version>${dhp.commons.lang.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
<version>1.7</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.sisyphsu</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
|
|
Loading…
Reference in New Issue