1
0
Fork 0

patch of mdstore records

This commit is contained in:
Michele Artini 2021-05-31 11:36:26 +02:00
parent ad56a44fda
commit e9f2b6037c
6 changed files with 448 additions and 30 deletions

View File

@ -1,24 +1,65 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.util.*; import java.util.ArrayList;
import java.util.stream.Collectors; import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentFactory; import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper; import org.dom4j.DocumentHelper;
import org.dom4j.Node; import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -43,6 +84,8 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final Map<String, String> nsContext = new HashMap<>(); protected static final Map<String, String> nsContext = new HashMap<>();
private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class);
static { static {
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
@ -61,6 +104,9 @@ public abstract class AbstractMdRecordToOafMapper {
} }
public List<Oaf> processMdRecord(final String xml) { public List<Oaf> processMdRecord(final String xml) {
// log.info("Processing record: " + xml);
try { try {
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
@ -100,10 +146,10 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected String getResultType(final Document doc, final List<Instance> instances) { protected String getResultType(final Document doc, final List<Instance> instances) {
String type = doc.valueOf("//dr:CobjCategory/@type"); final String type = doc.valueOf("//dr:CobjCategory/@type");
if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
String instanceType = instances final String instanceType = instances
.stream() .stream()
.map(i -> i.getInstancetype().getClassid()) .map(i -> i.getInstancetype().getClassid())
.findFirst() .findFirst()
@ -158,8 +204,12 @@ public abstract class AbstractMdRecordToOafMapper {
return oafs; return oafs;
} }
private OafEntity createEntity(Document doc, String type, List<Instance> instances, KeyValue collectedFrom, private OafEntity createEntity(final Document doc,
DataInfo info, long lastUpdateTimestamp) { final String type,
final List<Instance> instances,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "publication": case "publication":
final Publication p = new Publication(); final Publication p = new Publication();
@ -219,9 +269,7 @@ public abstract class AbstractMdRecordToOafMapper {
getRelation( getRelation(
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate));
res res
.add( .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate));
getRelation(
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate));
} }
} }
@ -411,10 +459,10 @@ public abstract class AbstractMdRecordToOafMapper {
return Lists.newArrayList(id); return Lists.newArrayList(id);
} }
} }
List<String> idList = doc final List<String> idList = doc
.selectNodes( .selectNodes(
"normalize-space(//*[local-name()='header']/*[local-name()='identifier' or local-name()='recordIdentifier']/text())"); "normalize-space(//*[local-name()='header']/*[local-name()='identifier' or local-name()='recordIdentifier']/text())");
Set<String> originalIds = Sets.newHashSet(idList); final Set<String> originalIds = Sets.newHashSet(idList);
if (originalIds.isEmpty()) { if (originalIds.isEmpty()) {
throw new IllegalStateException("missing originalID on " + doc.asXML()); throw new IllegalStateException("missing originalID on " + doc.asXML());
@ -423,8 +471,8 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) { protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) {
Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId); final Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId);
AccessRight accessRight = new AccessRight(); final AccessRight accessRight = new AccessRight();
accessRight.setClassid(qualifier.getClassid()); accessRight.setClassid(qualifier.getClassid());
accessRight.setClassname(qualifier.getClassname()); accessRight.setClassname(qualifier.getClassname());
accessRight.setSchemeid(qualifier.getSchemeid()); accessRight.setSchemeid(qualifier.getSchemeid());

View File

@ -3,7 +3,10 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Arrays; import java.util.Arrays;
import java.util.Date;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.UUID; import java.util.UUID;
@ -24,6 +27,11 @@ import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Namespace;
import org.dom4j.QName;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -38,6 +46,8 @@ import scala.Tuple2;
public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication { public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication {
private static final Logger log = LoggerFactory.getLogger(MigrateHdfsMdstoresApplication.class); private static final Logger log = LoggerFactory.getLogger(MigrateHdfsMdstoresApplication.class);
private static final Namespace DRI_NS_PREFIX = new Namespace("dri",
"http://www.driver-repository.eu/namespace/dri");
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -87,7 +97,7 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
spark spark
.read() .read()
.parquet(validPaths) .parquet(validPaths)
.map((MapFunction<Row, String>) r -> r.getAs("body"), Encoders.STRING()) .map((MapFunction<Row, String>) r -> enrichRecord(r), Encoders.STRING())
.toJavaRDD() .toJavaRDD()
.mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml))) .mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml)))
.coalesce(1) .coalesce(1)
@ -99,6 +109,26 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
*/ */
} }
private static String enrichRecord(final Row r) {
final String xml = r.getAs("body");
final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
final String collDate = dateFormat.format(new Date((Long) r.getAs("dateOfCollection")));
final String tranDate = dateFormat.format(new Date((Long) r.getAs("dateOfTransformation")));
try {
final Document doc = new SAXReader().read(new StringReader(xml));
final Element head = (Element) doc.selectSingleNode("//*[local-name() = 'header']");
head.addElement(new QName("objIdentifier", DRI_NS_PREFIX)).addText(r.getAs("id"));
head.addElement(new QName("dateOfCollection", DRI_NS_PREFIX)).addText(collDate);
head.addElement(new QName("dateOfTransformation", DRI_NS_PREFIX)).addText(tranDate);
return doc.asXML();
} catch (final Exception e) {
log.error("Error patching record: " + xml);
throw new RuntimeException("Error patching record: " + xml, e);
}
}
private static Set<String> mdstorePaths(final String mdstoreManagerUrl, private static Set<String> mdstorePaths(final String mdstoreManagerUrl,
final String format, final String format,
final String layout, final String layout,

View File

@ -0,0 +1,157 @@
<workflow-app name="Test Import of Hdfs Stores" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphOutputPath</name>
<description>the target path to store raw graph</description>
</property>
<property>
<name>contentPath</name>
<description>path location to store (or reuse) content from the aggregator</description>
</property>
<property>
<name>mdstoreManagerUrl</name>
<description>the address of the Mdstore Manager</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="ImportODF_hdfs"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportODF_hdfs">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ImportODF_hdfs</name>
<class>eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--hdfsPath</arg><arg>${contentPath}/odf_records_hdfs</arg>
<arg>--mdstoreManagerUrl</arg><arg>${mdstoreManagerUrl}</arg>
<arg>--mdFormat</arg><arg>ODF</arg>
<arg>--mdLayout</arg><arg>store</arg>
<arg>--mdInterpretation</arg><arg>cleaned</arg>
</spark>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/odf_records_hdfs</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--shouldHashId</arg><arg>${shouldHashId}</arg>
</spark>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/entities</arg>
<arg>--graphRawPath</arg><arg>${workingDir}/graph_raw</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -100,10 +100,58 @@
<arg>--mdLayout</arg><arg>store</arg> <arg>--mdLayout</arg><arg>store</arg>
<arg>--mdInterpretation</arg><arg>cleaned</arg> <arg>--mdInterpretation</arg><arg>cleaned</arg>
</spark> </spark>
<ok to="End"/> <ok to="GenerateEntities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/odf_records_hdfs</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--shouldHashId</arg><arg>${shouldHashId}</arg>
</spark>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/entities</arg>
<arg>--graphRawPath</arg><arg>${workingDir}/graph_raw</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,7 +1,11 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
import java.io.IOException; import java.io.IOException;
@ -21,7 +25,15 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest; import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -407,9 +419,10 @@ public class MappersTest {
assertNotNull(d.getTitle()); assertNotNull(d.getTitle());
assertEquals(1, d.getTitle().size()); assertEquals(1, d.getTitle().size());
assertEquals( assertEquals("Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia", d
"Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia", .getTitle()
d.getTitle().get(0).getValue()); .get(0)
.getValue());
assertNotNull(d.getDescription()); assertNotNull(d.getDescription());
assertEquals(1, d.getDescription().size()); assertEquals(1, d.getDescription().size());
@ -435,7 +448,7 @@ public class MappersTest {
assertNotNull(d.getInstance()); assertNotNull(d.getInstance());
assertTrue(d.getInstance().size() == 1); assertTrue(d.getInstance().size() == 1);
Instance i = d.getInstance().get(0); final Instance i = d.getInstance().get(0);
assertNotNull(i.getAccessright()); assertNotNull(i.getAccessright());
assertEquals(ModelConstants.DNET_ACCESS_MODES, i.getAccessright().getSchemeid()); assertEquals(ModelConstants.DNET_ACCESS_MODES, i.getAccessright().getSchemeid());
@ -607,8 +620,7 @@ public class MappersTest {
assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid()); assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid());
assertValidId(p.getInstance().get(0).getCollectedfrom().getKey()); assertValidId(p.getInstance().get(0).getCollectedfrom().getKey());
assertValidId(p.getInstance().get(0).getHostedby().getKey()); assertValidId(p.getInstance().get(0).getHostedby().getKey());
assertEquals( assertEquals("http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue());
"http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue());
assertEquals(1, p.getInstance().size()); assertEquals(1, p.getInstance().size());
assertNotNull(p.getInstance().get(0).getAlternateIdentifier()); assertNotNull(p.getInstance().get(0).getAlternateIdentifier());
@ -633,7 +645,55 @@ public class MappersTest {
System.out.println(p.getTitle().get(0).getValue()); System.out.println(p.getTitle().get(0).getValue());
} }
@Test
void testOdfFromHdfs() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(1, list.size());
System.out.println(list.get(0).getClass());
assertTrue(list.get(0) instanceof Dataset);
final Dataset p = (Dataset) list.get(0);
assertValidId(p.getId());
assertTrue(p.getOriginalId().size() == 1);
assertEquals("df76e73f-0483-49a4-a9bb-63f2f985574a", p.getOriginalId().get(0));
assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(p.getAuthor().size() > 0);
final Optional<Author> author = p
.getAuthor()
.stream()
.findFirst();
assertTrue(author.isPresent());
assertEquals("Museum Sønderjylland", author.get().getFullname());
assertTrue(p.getSubject().size() > 0);
assertTrue(p.getInstance().size() > 0);
assertNotNull(p.getTitle());
assertFalse(p.getTitle().isEmpty());
assertNotNull(p.getInstance());
assertTrue(p.getInstance().size() > 0);
p
.getInstance()
.stream()
.forEach(i -> {
assertNotNull(i.getAccessright());
assertEquals("UNKNOWN", i.getAccessright().getClassid());
});
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
}
private void assertValidId(final String id) { private void assertValidId(final String id) {
System.out.println(id);
assertEquals(49, id.length()); assertEquals(49, id.length());
assertEquals('|', id.charAt(2)); assertEquals('|', id.charAt(2));
assertEquals(':', id.charAt(15)); assertEquals(':', id.charAt(15));
@ -642,14 +702,12 @@ public class MappersTest {
private List<String> vocs() throws IOException { private List<String> vocs() throws IOException {
return IOUtils return IOUtils
.readLines( .readLines(GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
} }
private List<String> synonyms() throws IOException { private List<String> synonyms() throws IOException {
return IOUtils return IOUtils
.readLines( .readLines(GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
} }
} }

View File

@ -0,0 +1,77 @@
<record xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-3"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<identifier>df76e73f-0483-49a4-a9bb-63f2f985574a</identifier>
<datestamp>2020-09-30T08:17:54Z</datestamp>
<setSpec>eudat-b2find</setSpec>
<dr:dateOfTransformation>2021-05-20T13:43:52.888Z</dr:dateOfTransformation>
<dri:objIdentifier>test________::92fe3efa47883b2f3401e6a4bd92e9d7</dri:objIdentifier>
<dri:dateOfCollection>2020-05-21T05:26:15.93Z</dri:dateOfCollection>
<dri:dateOfTransformation>2020-08-01T11:06:26.977Z</dri:dateOfTransformation>
</header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
<creators>
<creator>
<creatorName>Museum Sønderjylland</creatorName>
</creator>
</creators>
<titles>
<title>200202-124 Hjelmvrå</title>
</titles>
<descriptions>
<description descriptionType="Abstract">This record describes
ancient sites and monuments as well archaeological excavations
undertaken by Danish museums. Excerpt of the Danish description of
events: 1995-04-26: Ved en besigtigelse ud for stedet fandt Nørgård
en større mængde skår i skovens udkant, liggende i nogle
drængrøfter1995-04-26: Leif Nørgård, der er leder af Sønderjyllands
Amatørarkæologer, havde ved en samtale med en tidligere ansat på
motorvejsprojektet gennem Sønderjylland fået at vide, at man på
dette sted havde fundet "urner".1995-04-26: Ved en besigtigelse ud
for stedet fandt Nørgård en større mængde skår i skovens udkant,
liggende i nogle drængrøfter1995-04-26: Leif Nørgård, der er leder
af Sønderjyllands Amatørarkæologer, havde ved en samtale med en
tidligere ansat på motorvejsprojektet gennem Sønderjylland fået at
vide, at man på dette sted havde fundet "urner".</description>
</descriptions>
<geoLocations>
<geoLocation>
<geoLocationPlace>(9.376 LON, 55.220 LAT)</geoLocationPlace>
</geoLocation>
</geoLocations>
<subjects>
<subject>Enkeltfund</subject>
<subject>Settlement</subject>
<subject>Single find</subject>
<subject>Archaeology</subject>
</subjects>
<alternateIdentifiers
xmlns="http://datacite.org/schema/kernel-3">
<alternateIdentifier
xmlns="http://datacite.org/schema/kernel-4"
alternateIdentifierType="URL">http://www.kulturarv.dk/fundogfortidsminder/Lokalitet/136540/</alternateIdentifier>
</alternateIdentifiers>
<publicationYear>2020</publicationYear>
<publisher>Slots- og Kulturstyrelsen (www.slks.dk)</publisher>
<language>Danish</language>
<rightsList>
<rights>Public</rights>
</rightsList>
<resourceType resourceTypeGeneral="Other">Dataset</resourceType>
</resource>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<oaf:dateAccepted>2020-01-01</oaf:dateAccepted>
<oaf:accessrights>UNKNOWN</oaf:accessrights>
<oaf:language>Danish</oaf:language>
<oaf:hostedBy name="B2FIND"
id="re3data_____::r3d100012377" />
<oaf:collectedFrom name="B2FIND"
id="re3data_____::r3d100012377" />
</metadata>
</record>