Irish oaipmh exporter #443

Merged
claudio.atzori merged 7 commits from irish-oaipmh-exporter into beta 2024-06-05 10:55:09 +02:00
4 changed files with 155 additions and 36 deletions
Showing only changes of commit 81090ad593 - Show all commits

View File

@ -46,15 +46,16 @@ public class IrishOaiExporterJob {
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString(XmlConverterJob.class .toString(
.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json"))); XmlConverterJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
parser.parseArgument(args); parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath"); final String inputPath = parser.get("inputPath");
@ -62,9 +63,9 @@ public class IrishOaiExporterJob {
final String dbUser = parser.get("dbUser"); final String dbUser = parser.get("dbUser");
final String dbPwd = parser.get("dbPwd"); final String dbPwd = parser.get("dbPwd");
final int numConnections = Optional final int numConnections = Optional
.ofNullable(parser.get("numConnections")) .ofNullable(parser.get("numConnections"))
.map(Integer::valueOf) .map(Integer::valueOf)
.orElse(NUM_CONNECTIONS); .orElse(NUM_CONNECTIONS);
log.info("inputPath: '{}'", inputPath); log.info("inputPath: '{}'", inputPath);
log.info("dbUrl: '{}'", dbUrl); log.info("dbUrl: '{}'", dbUrl);
@ -78,29 +79,31 @@ public class IrishOaiExporterJob {
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
conf.registerKryoClasses(new Class[] { conf.registerKryoClasses(new Class[] {
SerializableSolrInputDocument.class SerializableSolrInputDocument.class
}); });
final Encoder<TupleWrapper> encoderTuple = Encoders.bean(TupleWrapper.class); final Encoder<TupleWrapper> encoderTuple = Encoders.bean(TupleWrapper.class);
final Encoder<OaiRecordWrapper> encoderOaiRecord = Encoders.bean(OaiRecordWrapper.class); final Encoder<OaiRecordWrapper> encoderOaiRecord = Encoders.bean(OaiRecordWrapper.class);
final String date = LocalDateTime.now().toString();
log.info("Creating temporary table..."); log.info("Creating temporary table...");
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
final Dataset<OaiRecordWrapper> docs = spark final Dataset<OaiRecordWrapper> docs = spark
.read() .read()
.schema(encoderTuple.schema()) .schema(encoderTuple.schema())
.json(inputPath) .json(inputPath)
.as(encoderTuple) .as(encoderTuple)
.map((MapFunction<TupleWrapper, String>) TupleWrapper::getXml, Encoders.STRING()) .map((MapFunction<TupleWrapper, String>) TupleWrapper::getXml, Encoders.STRING())
.map((MapFunction<String, OaiRecordWrapper>) IrishOaiExporterJob::asIrishOaiResult, encoderOaiRecord) .map((MapFunction<String, OaiRecordWrapper>) r -> asIrishOaiResult(r, date), encoderOaiRecord)
.filter((FilterFunction<OaiRecordWrapper>) obj -> (obj != null) && StringUtils.isNotBlank(obj.getId())); .filter((FilterFunction<OaiRecordWrapper>) obj -> (obj != null) && StringUtils.isNotBlank(obj.getId()));
docs docs
.repartition(numConnections) .repartition(numConnections)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.jdbc(dbUrl, TMP_OAI_TABLE, connectionProperties); .jdbc(dbUrl, TMP_OAI_TABLE, connectionProperties);
}); });
log.info("Temporary table created."); log.info("Temporary table created.");
@ -108,14 +111,15 @@ public class IrishOaiExporterJob {
log.info("Updating OAI records..."); log.info("Updating OAI records...");
try (final Connection con = DriverManager.getConnection(dbUrl, dbUser, dbPwd)) { try (final Connection con = DriverManager.getConnection(dbUrl, dbUser, dbPwd)) {
try (final Statement st = con.createStatement()) { try (final Statement st = con.createStatement()) {
final String query = IOUtils.toString(IrishOaiExporterJob.class.getResourceAsStream("oai-finalize.sql")); final String query = IOUtils
.toString(IrishOaiExporterJob.class.getResourceAsStream("oai-finalize.sql"));
st.execute(query); st.execute(query);
} }
} }
log.info("DONE."); log.info("DONE.");
} }
protected static OaiRecordWrapper asIrishOaiResult(final String xml) { protected static OaiRecordWrapper asIrishOaiResult(final String xml, final String date) {
try { try {
final Document doc = DocumentHelper.parseText(xml); final Document doc = DocumentHelper.parseText(xml);
final OaiRecordWrapper r = new OaiRecordWrapper(); final OaiRecordWrapper r = new OaiRecordWrapper();
@ -123,7 +127,7 @@ public class IrishOaiExporterJob {
if (isValid(doc)) { if (isValid(doc)) {
r.setId(doc.valueOf("//*[local-name()='objIdentifier']").trim()); r.setId(doc.valueOf("//*[local-name()='objIdentifier']").trim());
r.setBody(gzip(doc.selectSingleNode("//*[local-name()='entity']").asXML())); r.setBody(gzip(doc.selectSingleNode("//*[local-name()='entity']").asXML()));
r.setDate(LocalDateTime.now().toString()); r.setDate(date);
r.setSets(new ArrayList<>()); r.setSets(new ArrayList<>());
} }
return r; return r;
@ -140,19 +144,25 @@ public class IrishOaiExporterJob {
if (n != null) { if (n != null) {
for (final Object o : n.selectNodes(".//*[local-name()='datainfo']/*[local-name()='deletedbyinference']")) { for (final Object o : n.selectNodes(".//*[local-name()='datainfo']/*[local-name()='deletedbyinference']")) {
if ("true".equals(((Node) o).getText().trim())) { return false; } if ("true".equals(((Node) o).getText().trim())) {
return false;
}
} }
// verify the main country of the result // verify the main country of the result
for (final Object o : n.selectNodes("./*[local-name()='country']")) { for (final Object o : n.selectNodes("./*[local-name()='country']")) {
if ("IE".equals(((Node) o).valueOf("@classid").trim())) { return true; } if ("IE".equals(((Node) o).valueOf("@classid").trim())) {
return true;
}
} }
// verify the countries of the related organizations // verify the countries of the related organizations
for (final Object o : n.selectNodes(".//*[local-name()='rel']")) { for (final Object o : n.selectNodes(".//*[local-name()='rel']")) {
final String relType = ((Node) o).valueOf("./*[local-name() = 'to']/@type").trim(); final String relType = ((Node) o).valueOf("./*[local-name() = 'to']/@type").trim();
final String relCountry = ((Node) o).valueOf("./*[local-name() = 'country']/@classid").trim(); final String relCountry = ((Node) o).valueOf("./*[local-name() = 'country']/@classid").trim();
if ("organization".equals(relType) && "IE".equals(relCountry)) { return true; } if ("organization".equals(relType) && "IE".equals(relCountry)) {
return true;
}
} }
} }
return false; return false;
@ -160,7 +170,9 @@ public class IrishOaiExporterJob {
} }
protected static byte[] gzip(final String str) { protected static byte[] gzip(final String str) {
if (StringUtils.isBlank(str)) { return null; } if (StringUtils.isBlank(str)) {
return null;
}
try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) { try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
try (final GZIPOutputStream gzip = new GZIPOutputStream(baos)) { try (final GZIPOutputStream gzip = new GZIPOutputStream(baos)) {

View File

@ -0,0 +1,106 @@
<!--
    Oozie workflow that runs a single Spark action:
    eu.dnetlib.dhp.oa.oaipmh.IrishOaiExporterJob.
    The job receives the HDFS input path plus the JDBC coordinates of the
    target database and the number of connections to use for the write.

    NOTE(review): jobTracker, nameNode and projectVersion are referenced below
    but not declared under <parameters>; presumably they are supplied by the
    job properties / build filtering. Confirm against the deployment setup.
-->
<workflow-app name="irish-oaipmh-provision" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <!-- Job arguments, forwarded to the Spark application as command-line options. -->
        <property>
            <name>inputPath</name>
            <description>The path of the input records on HDFS</description>
        </property>
        <property>
            <name>numConnections</name>
            <description>number of connections to the postgres db (for the write operation)</description>
        </property>
        <property>
            <name>dbUrl</name>
            <description>the url of the database</description>
        </property>
        <property>
            <name>dbUser</name>
            <description>the user of the database</description>
        </property>
        <property>
            <name>dbPwd</name>
            <description>the password for the user of the database</description>
        </property>

        <!-- Spark resource sizing. -->
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>

        <!-- Cluster / Spark 2 integration settings. -->
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <!--
        The start transition must name an existing node. It previously pointed
        to "oaiphm_provision", which is not defined anywhere in this workflow;
        the only action is "irish_oaiphm_provision".
    -->
    <start to="irish_oaiphm_provision"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <!--
        Single Spark action, submitted to YARN in cluster mode.
        NOTE(review): the action name and the Spark application name spell the
        protocol "PHM" rather than "PMH"; left untouched here because they are
        runtime identifiers that other tooling may reference.
        NOTE(review): the database password is passed as a plain command-line
        argument; confirm this is acceptable for the target environment.
    -->
    <action name="irish_oaiphm_provision">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Irish OAI-PHM provision</name>
            <class>eu.dnetlib.dhp.oa.oaipmh.IrishOaiExporterJob</class>
            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}</arg>
            <arg>--numConnections</arg><arg>${numConnections}</arg>
            <arg>--dbUrl</arg><arg>${dbUrl}</arg>
            <arg>--dbUser</arg><arg>${dbUser}</arg>
            <arg>--dbPwd</arg><arg>${dbPwd}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>

</workflow-app>

View File

@ -42,10 +42,10 @@ public class DbSerializationTest {
conf.set("spark.driver.host", "localhost"); conf.set("spark.driver.host", "localhost");
spark = SparkSession spark = SparkSession
.builder() .builder()
.appName("TEST") .appName("TEST")
.config(conf) .config(conf)
.getOrCreate(); .getOrCreate();
} }
@AfterAll @AfterAll
@ -79,9 +79,9 @@ public class DbSerializationTest {
final Dataset<OaiRecordWrapper> docs = spark.createDataset(list, Encoders.bean(OaiRecordWrapper.class)); final Dataset<OaiRecordWrapper> docs = spark.createDataset(list, Encoders.bean(OaiRecordWrapper.class));
docs docs
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.jdbc(dbUrl, IrishOaiExporterJob.TMP_OAI_TABLE, connectionProperties); .jdbc(dbUrl, IrishOaiExporterJob.TMP_OAI_TABLE, connectionProperties);
}); });

View File

@ -10,6 +10,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.time.LocalDateTime;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -23,7 +24,7 @@ public class IrishOaiExporterJobTest {
@Test @Test
void testAsIrishOaiResult() throws Exception { void testAsIrishOaiResult() throws Exception {
final String xml = IOUtils.toString(getClass().getResourceAsStream("record_IE.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("record_IE.xml"));
final OaiRecordWrapper res = IrishOaiExporterJob.asIrishOaiResult(xml); final OaiRecordWrapper res = IrishOaiExporterJob.asIrishOaiResult(xml, LocalDateTime.now().toString());
assertNotNull(res.getId()); assertNotNull(res.getId());
assertNotNull(res.getBody()); assertNotNull(res.getBody());
assertNotNull(res.getSets()); assertNotNull(res.getSets());