imported more diffs from master branch; code formatting

This commit is contained in:
Claudio Atzori 2020-12-10 16:14:16 +01:00
parent 1eaad89a3c
commit d9532446eb
7 changed files with 83 additions and 54 deletions

View File

@ -7,6 +7,10 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class ModelConstants { public class ModelConstants {
public static final String ORCID = "orcid";
public static final String ORCID_PENDING = "orcid_pending";
public static final String ORCID_CLASSNAME = "Open Researcher and Contributor ID";
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"; public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254"; public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";

View File

@ -9,8 +9,8 @@ import java.util.List;
*/ */
public class BipScore implements Serializable { public class BipScore implements Serializable {
private String id; //doi private String id; // doi
private List<Score> scoreList; //unit as given in the inputfile private List<Score> scoreList; // unit as given in the inputfile
public String getId() { public String getId() {
return id; return id;

View File

@ -144,7 +144,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
} }
private static List<Measure> getMeasure(BipScore value) { private static List<Measure> getMeasure(BipScore value) {
return value return value
.getScoreList() .getScoreList()

View File

@ -10,7 +10,6 @@ import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.fasterxml.jackson.databind.DeserializationFeature;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -22,6 +21,7 @@ import org.apache.spark.sql.expressions.Aggregator;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration; import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.DocumentContext;
@ -44,7 +44,7 @@ public class GroupEntitiesSparkJob {
private final static String ID_JPATH = "$.id"; private final static String ID_JPATH = "$.id";
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper() private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {

View File

@ -1,7 +1,7 @@
package eu.dnetlib.doiboost.orcid package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
import eu.dnetlib.dhp.schema.oaf.{Author, Publication} import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier} import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
import org.apache.commons.lang.StringUtils import org.apache.commons.lang.StringUtils
@ -44,23 +44,19 @@ object ORCIDToOAF {
} }
def convertTOOAF(input:ORCIDElement) :Publication = { def convertTOOAF(input:OrcidDOI) :Publication = {
val doi = input.doi val doi = input.getDoi
val pub:Publication = new Publication val pub:Publication = new Publication
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava) pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo()) pub.setDataInfo(generateDataInfo())
//IMPORTANT
//The old method pub.setId(IdentifierFactory.createIdentifier(pub))
//will be replaced using IdentifierFactory
pub.setId(generateIdentifier(pub, doi.toLowerCase)) pub.setId(generateIdentifier(pub, doi.toLowerCase))
pub.setId(IdentifierFactory.createIdentifier(pub))
try{ try{
pub.setAuthor(input.authors.map(a=> {
generateAuthor(a.name, a.surname, a.creditName, a.oid) val l:List[Author]= input.getAuthors.asScala.map(a=> {
}).asJava) generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
})(collection.breakOut)
pub.setAuthor(l.asJava)
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava) pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo()) pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
pub pub
@ -71,6 +67,13 @@ object ORCIDToOAF {
} }
} }
/**
  * Builds the DataInfo attached to ORCID author pids (see generateAuthor).
  *
  * Starts from DoiBoostMappingUtil.generateDataInfo("0.91") and overrides the
  * provenance action with classid "sysimport:crosswalk:entityregistry" and
  * classname "Harvested".
  *
  * @return the customised DataInfo instance
  */
def generateOricPIDDatainfo():DataInfo = {
  val dataInfo = DoiBoostMappingUtil.generateDataInfo("0.91")
  // The provenance action is mutated in place on the freshly generated DataInfo.
  val provenance = dataInfo.getProvenanceaction
  provenance.setClassid("sysimport:crosswalk:entityregistry")
  provenance.setClassname("Harvested")
  dataInfo
}
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = { def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
val a = new Author val a = new Author
a.setName(given) a.setName(given)
@ -80,10 +83,10 @@ object ORCIDToOAF {
else else
a.setFullname(s"$given $family") a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid)) if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava) a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)
a a
} }
} }

View File

@ -45,24 +45,24 @@ object SparkConvertORCIDToOAF {
Encoders.kryo(classOf[Publication]) Encoders.kryo(classOf[Publication])
} }
def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = { def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI] implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs) implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI]))) val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
logger.info("Converting ORCID to OAF") logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null) dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
.map(d => (d.getId, d)) .map(d => (d.getId, d))
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(getPublicationAggregator().toColumn) .agg(getPublicationAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.write.mode(SaveMode.Overwrite).save(targetPath) .write.mode(SaveMode.Overwrite).save(targetPath)
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
@ -85,4 +85,4 @@ def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
} }
} }

View File

@ -10,6 +10,7 @@ import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.Optional; import java.util.Optional;
import javax.swing.text.html.Option;
import javax.xml.transform.Transformer; import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
@ -42,6 +43,10 @@ public class XmlIndexingJob {
private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class);
/**
 * Destination selected for the documents produced by the indexing job.
 * Parsed from the "outputFormat" argument via {@code OutputFormat.valueOf};
 * the job falls back to {@link #SOLR} when the argument is absent.
 */
public enum OutputFormat {
	/** Documents are sent to Solr through SolrSupport.indexDocs. */
	SOLR,
	/** Documents are written as a parquet dataset under outputPath. */
	HDFS
}
private static final Integer DEFAULT_BATCH_SIZE = 1000; private static final Integer DEFAULT_BATCH_SIZE = 1000;
protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'"; protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
@ -52,6 +57,8 @@ public class XmlIndexingJob {
private int batchSize; private int batchSize;
private OutputFormat outputFormat;
private String outputPath; private String outputPath;
private SparkSession spark; private SparkSession spark;
@ -80,14 +87,22 @@ public class XmlIndexingJob {
final String outputPath = Optional final String outputPath = Optional
.ofNullable(parser.get("outputPath")) .ofNullable(parser.get("outputPath"))
.map(StringUtils::trim)
.orElse(null); .orElse(null);
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final Integer batchSize = parser.getObjectMap().containsKey("batchSize") final Integer batchSize = Optional
? Integer.valueOf(parser.get("batchSize")) .ofNullable(parser.get("batchSize"))
: DEFAULT_BATCH_SIZE; .map(Integer::valueOf)
.orElse(DEFAULT_BATCH_SIZE);
log.info("batchSize: {}", batchSize); log.info("batchSize: {}", batchSize);
final OutputFormat outputFormat = Optional
.ofNullable(parser.get("outputFormat"))
.map(OutputFormat::valueOf)
.orElse(OutputFormat.SOLR);
log.info("outputFormat: {}", outputFormat);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
conf.registerKryoClasses(new Class[] { conf.registerKryoClasses(new Class[] {
SerializableSolrInputDocument.class SerializableSolrInputDocument.class
@ -100,15 +115,18 @@ public class XmlIndexingJob {
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl)); final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
new XmlIndexingJob(spark, inputPath, format, batchSize, outputPath).run(isLookup); new XmlIndexingJob(spark, inputPath, format, batchSize, outputFormat, outputPath).run(isLookup);
}); });
} }
public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize, String outputPath) { public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize,
OutputFormat outputFormat,
String outputPath) {
this.spark = spark; this.spark = spark;
this.inputPath = inputPath; this.inputPath = inputPath;
this.format = format; this.format = format;
this.batchSize = batchSize; this.batchSize = batchSize;
this.outputFormat = outputFormat;
this.outputPath = outputPath; this.outputPath = outputPath;
} }
@ -137,17 +155,22 @@ public class XmlIndexingJob {
.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)); .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s));
if (StringUtils.isNotBlank(outputPath)) { switch (outputFormat) {
spark case SOLR:
.createDataset( final String collection = ProvisionConstants.getCollectionName(format);
docs.map(s -> new SerializableSolrInputDocument(s)).rdd(), SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
Encoders.kryo(SerializableSolrInputDocument.class)) break;
.write() case HDFS:
.mode(SaveMode.Overwrite) spark
.parquet(outputPath); .createDataset(
} else { docs.map(s -> new SerializableSolrInputDocument(s)).rdd(),
final String collection = ProvisionConstants.getCollectionName(format); Encoders.kryo(SerializableSolrInputDocument.class))
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); .write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
break;
default:
throw new IllegalArgumentException("invalid outputFormat: " + outputFormat);
} }
} }