adopting dhp-schemas:8.0.1 to support Author's rawAffiliationString(s). Improved graph2hive implementation

This commit is contained in:
Claudio Atzori 2024-10-08 16:22:54 +02:00
parent d5867a1992
commit 62ff843334
13 changed files with 22 additions and 28 deletions

View File

@ -407,10 +407,9 @@ object DataciteToOAFTransformation {
)
}
if (c.affiliation.isDefined)
a.setAffiliation(
a.setRawAffiliationString(
c.affiliation.get
.filter(af => af.nonEmpty)
.map(af => OafMapperUtils.field(af, dataInfo))
.asJava
)
a.setRank(idx + 1)

View File

@ -313,7 +313,7 @@ case object ConversionUtil {
if (f.author.DisplayName.isDefined)
a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setRawAffiliationString(List(f.affiliation).asJava)
a.setPid(
List(
createSP(
@ -386,7 +386,7 @@ case object ConversionUtil {
a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setRawAffiliationString(List(f.affiliation).asJava)
a.setPid(
List(

View File

@ -9,10 +9,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -25,8 +22,6 @@ public class GraphHiveTableImporterJob {
private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -74,7 +69,12 @@ public class GraphHiveTableImporterJob {
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
Class<T> clazz, int numPartitions) {
Dataset<String> dataset = spark.read().textFile(inputPath);
final Encoder<T> clazzEncoder = Encoders.bean(clazz);
Dataset<Row> dataset = spark
.read()
.schema(clazzEncoder.schema())
.json(inputPath);
if (numPartitions > 0) {
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@ -82,7 +82,6 @@ public class GraphHiveTableImporterJob {
}
dataset
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.saveAsTable(tableIdentifier(hiveDbName, clazz));

View File

@ -94,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
}
author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info));
author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
author.setPid(preparePids(n, info));
author.setRank(pos++);
res.add(author);

View File

@ -73,14 +73,10 @@ public class GraphHiveImporterJobTest {
GraphHiveImporterJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
"-hiveMetastoreUris",
"",
"-hiveDbName",
dbName
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
"--hiveMetastoreUris", "",
"--hiveDbName", dbName
});
ModelSupport.oafTypes

View File

@ -406,15 +406,15 @@ class MappersTest {
assertEquals("Baracchini", author.get().getSurname());
assertEquals("Theo", author.get().getName());
assertEquals(1, author.get().getAffiliation().size());
final Optional<Field<String>> opAff = author
assertEquals(1, author.get().getRawAffiliationString().size());
final Optional<String> opAff = author
.get()
.getAffiliation()
.getRawAffiliationString()
.stream()
.findFirst();
assertTrue(opAff.isPresent());
final Field<String> affiliation = opAff.get();
assertEquals("ISTI-CNR", affiliation.getValue());
final String affiliation = opAff.get();
assertEquals("ISTI-CNR", affiliation);
assertFalse(d.getSubject().isEmpty());
assertFalse(d.getInstance().isEmpty());

View File

@ -937,7 +937,7 @@
<commons.logging.version>1.1.3</commons.logging.version>
<commons-validator.version>1.7</commons-validator.version>
<dateparser.version>1.0.7</dateparser.version>
<dhp-schemas.version>[7.0.1]</dhp-schemas.version>
<dhp-schemas.version>[8.0.1]</dhp-schemas.version>
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version>