forked from D-Net/dnet-hadoop
logs for non-well-formed XML files
This commit is contained in:
parent
b09d7ddc74
commit
27af5122d2
|
@ -22,6 +22,8 @@ import java.util.*;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.*;
|
import org.dom4j.*;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
@ -77,6 +79,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected static final Map<String, String> nsContext = new HashMap<>();
|
protected static final Map<String, String> nsContext = new HashMap<>();
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class);
|
||||||
|
|
||||||
static {
|
static {
|
||||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||||
|
@ -106,37 +110,41 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
public List<Oaf> processMdRecord(final String xml) throws DocumentException {
|
public List<Oaf> processMdRecord(final String xml) throws DocumentException {
|
||||||
|
|
||||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||||
|
try {
|
||||||
|
final Document doc = DocumentHelper
|
||||||
|
.parseText(
|
||||||
|
xml
|
||||||
|
.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)
|
||||||
|
.replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3)
|
||||||
|
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3));
|
||||||
|
|
||||||
final Document doc = DocumentHelper
|
final KeyValue collectedFrom = getProvenanceDatasource(
|
||||||
.parseText(
|
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
|
||||||
xml
|
|
||||||
.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)
|
|
||||||
.replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3)
|
|
||||||
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3));
|
|
||||||
|
|
||||||
final KeyValue collectedFrom = getProvenanceDatasource(
|
if (collectedFrom == null) {
|
||||||
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
|
return Lists.newArrayList();
|
||||||
|
}
|
||||||
|
|
||||||
if (collectedFrom == null) {
|
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
|
||||||
return Lists.newArrayList();
|
? collectedFrom
|
||||||
|
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
|
||||||
|
|
||||||
|
if (hostedBy == null) {
|
||||||
|
return Lists.newArrayList();
|
||||||
|
}
|
||||||
|
|
||||||
|
final DataInfo info = prepareDataInfo(doc, invisible);
|
||||||
|
final long lastUpdateTimestamp = new Date().getTime();
|
||||||
|
|
||||||
|
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
|
||||||
|
|
||||||
|
final String type = getResultType(doc, instances);
|
||||||
|
|
||||||
|
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
} catch (DocumentException e) {
|
||||||
|
log.error("Error with record:\n" + xml);
|
||||||
|
throw e;
|
||||||
}
|
}
|
||||||
|
|
||||||
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
|
|
||||||
? collectedFrom
|
|
||||||
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
|
|
||||||
|
|
||||||
if (hostedBy == null) {
|
|
||||||
return Lists.newArrayList();
|
|
||||||
}
|
|
||||||
|
|
||||||
final DataInfo info = prepareDataInfo(doc, invisible);
|
|
||||||
final long lastUpdateTimestamp = new Date().getTime();
|
|
||||||
|
|
||||||
final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
|
|
||||||
|
|
||||||
final String type = getResultType(doc, instances);
|
|
||||||
|
|
||||||
return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getResultType(final Document doc, final List<Instance> instances) {
|
protected String getResultType(final Document doc, final List<Instance> instances) {
|
||||||
|
|
|
@ -12,6 +12,7 @@ import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import kotlin.jvm.Throws;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
|
@ -926,6 +927,17 @@ class MappersTest {
|
||||||
// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNotWellFormed() throws IOException, DocumentException {
|
||||||
|
final String xml = IOUtils
|
||||||
|
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
||||||
|
final DocumentException generalEx = new DocumentException();
|
||||||
|
|
||||||
|
DocumentException exception = assertThrows(DocumentException.class, () -> {
|
||||||
|
new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
private void assertValidId(final String id) {
|
private void assertValidId(final String id) {
|
||||||
// System.out.println(id);
|
// System.out.println(id);
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||||
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<header xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection/>
|
||||||
|
<dri:mdFormat/>
|
||||||
|
<dri:mdFormatInterpretation/>
|
||||||
|
<dri:repositoryId/>
|
||||||
|
<dr:objectIdentifier/>
|
||||||
|
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
|
||||||
|
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
|
||||||
|
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
|
||||||
|
</header>
|
||||||
|
<metadata xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dc:title>多項式GCDを用いた復号法に関する研究<dc:title>
|
||||||
|
<dc:creator>上原, 剛</dc:creator>
|
||||||
|
<dc:creator>甲斐, 博</dc:creator>
|
||||||
|
<dc:creator>野田, 松太郎</dc:creator>
|
||||||
|
<dc:format>application/pdf</dc:format>
|
||||||
|
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
|
||||||
|
<dc:language>jpn</dc:language>
|
||||||
|
<dc:publisher>京都大学数理解析研究所</dc:publisher>
|
||||||
|
<dc:subject classid="ndc" classname="ndc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
|
||||||
|
<dc:type>Departmental Bulletin Paper</dc:type>
|
||||||
|
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
|
||||||
|
<oaf:projectid/>
|
||||||
|
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
|
||||||
|
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
|
||||||
|
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
|
||||||
|
</metadata>
|
||||||
|
<about>
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
|
||||||
|
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
|
||||||
|
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
|
||||||
|
<datestamp>2021-04-13T13:36:29Z</datestamp>
|
||||||
|
<metadataNamespace/>
|
||||||
|
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
|
||||||
|
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
|
||||||
|
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
|
||||||
|
<datestamp>2012-07-12T14:15:41Z</datestamp>
|
||||||
|
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
|
||||||
|
</originDescription>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||||
|
classname="sysimport:crosswalk:repository"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
|
@ -0,0 +1,8 @@
|
||||||
|
# Root logger option
|
||||||
|
log4j.rootLogger=DEBUG, stdout
|
||||||
|
|
||||||
|
# Direct log messages to stdout
|
||||||
|
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
|
||||||
|
log4j.appender.stdout.Target=System.out
|
||||||
|
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
|
||||||
|
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
|
Loading…
Reference in New Issue