merged from branch discard-non-wellformed

This commit is contained in:
Claudio Atzori 2022-09-19 10:17:10 +02:00
commit 192215a18e
10 changed files with 331 additions and 37 deletions

View File

@ -14,6 +14,8 @@ import java.util.*;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.*; import org.dom4j.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
@ -50,6 +52,8 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final Map<String, String> nsContext = new HashMap<>(); protected static final Map<String, String> nsContext = new HashMap<>();
private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class);
static { static {
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
@ -76,40 +80,44 @@ public abstract class AbstractMdRecordToOafMapper {
this.forceOriginalId = false; this.forceOriginalId = false;
} }
public List<Oaf> processMdRecord(final String xml) throws DocumentException { public List<Oaf> processMdRecord(final String xml) {
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
try {
final Document doc = DocumentHelper
.parseText(
xml
.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)
.replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3)
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3));
final Document doc = DocumentHelper final KeyValue collectedFrom = getProvenanceDatasource(
.parseText( doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
xml
.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)
.replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3)
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3));
final KeyValue collectedFrom = getProvenanceDatasource( if (collectedFrom == null) {
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); return Lists.newArrayList();
}
if (collectedFrom == null) { final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
? collectedFrom
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
if (hostedBy == null) {
return Lists.newArrayList();
}
final DataInfo info = prepareDataInfo(doc, invisible);
final long lastUpdateTimestamp = new Date().getTime();
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
final String type = getResultType(doc, instances);
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
} catch (DocumentException e) {
log.error("Error with record:\n" + xml);
return Lists.newArrayList(); return Lists.newArrayList();
} }
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
? collectedFrom
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
if (hostedBy == null) {
return Lists.newArrayList();
}
final DataInfo info = prepareDataInfo(doc, invisible);
final long lastUpdateTimestamp = new Date().getTime();
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
final String type = getResultType(doc, instances);
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
} }
protected String getResultType(final Document doc, final List<Instance> instances) { protected String getResultType(final Document doc, final List<Instance> instances) {

View File

@ -16,6 +16,9 @@ import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -127,8 +130,8 @@ public class GenerateEntitiesApplication {
.sequenceFile(sp, Text.class, Text.class) .sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs)) .map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
.filter(Objects::nonNull) .flatMap(List::iterator)
.flatMap(List::iterator)); .filter(Objects::nonNull));
} }
switch (mode) { switch (mode) {
@ -155,11 +158,11 @@ public class GenerateEntitiesApplication {
.saveAsTextFile(targetPath, GzipCodec.class); .saveAsTextFile(targetPath, GzipCodec.class);
} }
private static List<Oaf> convertToListOaf( public static List<Oaf> convertToListOaf(
final String id, final String id,
final String s, final String s,
final boolean shouldHashId, final boolean shouldHashId,
final VocabularyGroup vocs) throws DocumentException { final VocabularyGroup vocs) {
final String type = StringUtils.substringAfter(id, ":"); final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
@ -200,8 +203,7 @@ public class GenerateEntitiesApplication {
try { try {
return OBJECT_MAPPER.readValue(s, clazz); return OBJECT_MAPPER.readValue(s, clazz);
} catch (final Exception e) { } catch (final Exception e) {
log.error("Error parsing object of class: {}", clazz); log.error("Error parsing object of class: {}:\n{}", clazz, s);
log.error(s);
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);
} }
} }

View File

@ -0,0 +1,108 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
/**
 * Spark application that runs the raw-record mapping over every record stored in the given sequence files and
 * writes, as gzip-compressed text under {@code invalidPath}, the records for which the mapping produced an empty
 * list of {@link Oaf} objects.
 */
public class VerifyRecordsApplication {

	private static final Logger log = LoggerFactory.getLogger(VerifyRecordsApplication.class);

	/** Classpath location of the JSON document describing the accepted command line parameters. */
	private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json";

	/**
	 * Entry point: parses the arguments, loads the vocabularies from the lookup service, clears
	 * {@code invalidPath} and then verifies the records found in {@code sourcePaths}.
	 *
	 * @param args command line arguments, as described by the parameter specification resource
	 * @throws Exception when the arguments cannot be parsed or the job fails
	 */
	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(readParameterSpec());
		parser.parseArgument(args);

		final Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String sourcePaths = parser.get("sourcePaths");
		log.info("sourcePaths: {}", sourcePaths);

		// NOTE(review): the parameter specification declares invalidPath as not required, yet it is used
		// unconditionally below -- confirm whether it should be marked as required.
		final String invalidPath = parser.get("invalidPath");
		log.info("invalidPath: {}", invalidPath);

		final String isLookupUrl = parser.get("isLookupUrl");
		log.info("isLookupUrl: {}", isLookupUrl);

		final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
		final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);

		final SparkConf conf = new SparkConf();
		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
			// start from a clean output location, since the results are written in append mode
			HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration());
			validateRecords(spark, sourcePaths, invalidPath, vocs);
		});
	}

	/**
	 * Reads the parameter specification from the classpath as UTF-8 text, closing the stream afterwards.
	 *
	 * @return the content of the parameter specification resource
	 * @throws IOException when the resource cannot be read
	 * @throws NullPointerException when the resource is not available on the classpath
	 */
	private static String readParameterSpec() throws IOException {
		try (InputStream spec = VerifyRecordsApplication.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
			return IOUtils
				.toString(
					Objects.requireNonNull(spec, "missing classpath resource: " + PARAMETERS_RESOURCE),
					StandardCharsets.UTF_8);
		}
	}

	/**
	 * Processes each existing source path and appends the records that could not be mapped to
	 * {@code invalidPath}.
	 *
	 * @param spark the active Spark session
	 * @param sourcePaths comma separated list of sequence file paths; the paths for which
	 *        {@code HdfsSupport.exists} returns false are skipped
	 * @param invalidPath the location where the unmappable records are written
	 * @param vocs the vocabularies handed over to the mapping
	 */
	private static void validateRecords(SparkSession spark, String sourcePaths, String invalidPath,
		VocabularyGroup vocs) {

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
		final List<String> existingSourcePaths = Arrays
			.stream(sourcePaths.split(","))
			.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
			.collect(Collectors.toList());

		log.info("Verify records in files:");
		existingSourcePaths.forEach(log::info);

		for (final String sp : existingSourcePaths) {
			// tryApplyMapping yields null for the records that were mapped, hence the nonNull filter
			final RDD<String> invalidRecords = sc
				.sequenceFile(sp, Text.class, Text.class)
				.map(k -> tryApplyMapping(k._1().toString(), k._2().toString(), true, vocs))
				.filter(Objects::nonNull)
				.rdd();
			spark
				.createDataset(invalidRecords, Encoders.STRING())
				.write()
				.mode(SaveMode.Append)
				.option("compression", "gzip")
				.text(invalidPath);
		}
	}

	/**
	 * Applies the mapping to a single record.
	 *
	 * @param id the key of the record in the sequence file
	 * @param xmlRecord the record payload
	 * @param shouldHashId forwarded as is to the mapping
	 * @param vocs the vocabularies handed over to the mapping
	 * @return the record itself when the mapping returned an empty list; {@code null} in any other case,
	 *         including when the mapping returned {@code null}
	 */
	private static String tryApplyMapping(
		final String id,
		final String xmlRecord,
		final boolean shouldHashId,
		final VocabularyGroup vocs) {

		final List<Oaf> oaf = GenerateEntitiesApplication.convertToListOaf(id, xmlRecord, shouldHashId, vocs);
		return oaf != null && oaf.isEmpty() ? xmlRecord : null;
	}
}

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -24,8 +25,11 @@ import org.apache.http.impl.client.HttpClients;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.raw.OafToOafMapper;
import eu.dnetlib.dhp.oa.graph.raw.OdfToOafMapper;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class AbstractMigrationApplication implements Closeable { public class AbstractMigrationApplication implements Closeable {

View File

@ -446,10 +446,34 @@
<join name="wait_import" to="fork_generate_entities"/> <join name="wait_import" to="fork_generate_entities"/>
<fork name="fork_generate_entities"> <fork name="fork_generate_entities">
<path start="GenerateEntities_claim"/> <path start="VerifyRecords_claim"/>
<path start="GenerateEntities"/> <path start="VerifyRecords"/>
</fork> </fork>
<!-- Runs VerifyRecordsApplication on the claim inputs (db_claims, oaf_claims, odf_claims under ${contentPath}),
     writing to ${workingDir}/invalid_records_claim; continues with GenerateEntities_claim on success
     and goes to Kill on error. -->
<action name="VerifyRecords_claim">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>VerifyRecords_claim</name>
<class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
<arg>--invalidPath</arg><arg>${workingDir}/invalid_records_claim</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="GenerateEntities_claim"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities_claim"> <action name="GenerateEntities_claim">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -499,6 +523,30 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Runs VerifyRecordsApplication on the non-claim inputs listed in sourcePaths (db, oaf and odf stores
     under ${contentPath}), writing to ${workingDir}/invalid_records; continues with GenerateEntities
     on success and goes to Kill on error. -->
<action name="VerifyRecords">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>VerifyRecords</name>
<class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible</arg>
<arg>--invalidPath</arg><arg>${workingDir}/invalid_records</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities"> <action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>

View File

@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "s",
"paramLongName": "sourcePaths",
"paramDescription": "the HDFS source paths which contain the sequence files (comma separated)",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "invalidPath",
"paramDescription": "the path of the invalid records file",
"paramRequired": false
},
{
"paramName": "isu",
"paramLongName": "isLookupUrl",
"paramDescription": "the url of the ISLookupService",
"paramRequired": true
}
]

View File

@ -21,7 +21,6 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
@ -948,6 +947,15 @@ class MappersTest {
} }
/**
 * Feeds the mapper with the content of the {@code oaf_notwellformed.xml} fixture and checks that the outcome
 * is a non-null, empty list.
 */
@Test
void testNotWellFormed() throws IOException {
	final String malformedRecord = IOUtils
		.toString(
			Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));

	final OafToOafMapper mapper = new OafToOafMapper(vocs, false, true);
	final List<Oaf> mapped = mapper.processMdRecord(malformedRecord);

	assertNotNull(mapped);
	assertTrue(mapped.isEmpty());
}
private void assertValidId(final String id) { private void assertValidId(final String id) {
// System.out.println(id); // System.out.println(id);

View File

@ -251,6 +251,18 @@ class MigrateDbEntitiesApplicationTest {
assertValidId(r2.getSource()); assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget()); assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget()); assertEquals(r2.getSource(), r1.getTarget());
assertTrue(r1.getSource().startsWith("10|"));
assertTrue(r1.getTarget().startsWith("20|"));
assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r1.getRelType());
assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r2.getRelType());
assertEquals(ModelConstants.PROVISION, r1.getSubRelType());
assertEquals(ModelConstants.PROVISION, r2.getSubRelType());
assertEquals(ModelConstants.IS_PROVIDED_BY, r1.getRelClass());
assertEquals(ModelConstants.PROVIDES, r2.getRelClass());
} }
@Test @Test

View File

@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>多項式GCDを用いた復号法に関する研究<dc:title>
<dc:creator>上原, 剛</dc:creator>
<dc:creator>甲斐, 博</dc:creator>
<dc:creator>野田, 松太郎</dc:creator>
<dc:format>application/pdf</dc:format>
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
<dc:language>jpn</dc:language>
<dc:publisher>京都大学数理解析研究所</dc:publisher>
<dc:subject classid="ndc" classname="ndc"
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
<dc:type>Departmental Bulletin Paper</dc:type>
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
<oaf:projectid/>
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
</metadata>
<about>
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
<datestamp>2021-04-13T13:36:29Z</datestamp>
<metadataNamespace/>
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
<datestamp>2012-07-12T14:15:41Z</datestamp>
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
</originDescription>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -0,0 +1,8 @@
# Root logger option
log4j.rootLogger=DEBUG, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n