updated index mapping to include orcid

This commit is contained in:
Sandro La Bruzzo 2023-04-27 10:00:34 +02:00
parent ab1842e5dc
commit 4f07dba68f
13 changed files with 410 additions and 304 deletions

View File

@ -1,158 +0,0 @@
package eu.dnetlib.dhp.sx.graph.scholix;
import java.util.List;
public class ScholixFlat {
private String identifier;
private String relationType;
private String sourceId;
private String sourceType;
private String sourceSubType;
private List<String> sourcePid;
private List<String> sourcePidType;
private List<String> sourcePublisher;
private String targetId;
private String targetType;
private String targetSubType;
private List<String> targetPid;
private List<String> targetPidType;
private List<String> targetPublisher;
private List<String> linkProviders;
private String publicationDate;
private String blob;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelationType() {
return relationType;
}
public void setRelationType(String relationType) {
this.relationType = relationType;
}
public String getSourceId() {
return sourceId;
}
public void setSourceId(String sourceId) {
this.sourceId = sourceId;
}
public String getSourceType() {
return sourceType;
}
public void setSourceType(String sourceType) {
this.sourceType = sourceType;
}
public String getSourceSubType() {
return sourceSubType;
}
public void setSourceSubType(String sourceSubType) {
this.sourceSubType = sourceSubType;
}
public List<String> getSourcePid() {
return sourcePid;
}
public void setSourcePid(List<String> sourcePid) {
this.sourcePid = sourcePid;
}
public List<String> getSourcePidType() {
return sourcePidType;
}
public void setSourcePidType(List<String> sourcePidType) {
this.sourcePidType = sourcePidType;
}
public List<String> getSourcePublisher() {
return sourcePublisher;
}
public void setSourcePublisher(List<String> sourcePublisher) {
this.sourcePublisher = sourcePublisher;
}
public String getTargetId() {
return targetId;
}
public void setTargetId(String targetId) {
this.targetId = targetId;
}
public String getTargetType() {
return targetType;
}
public void setTargetType(String targetType) {
this.targetType = targetType;
}
public String getTargetSubType() {
return targetSubType;
}
public void setTargetSubType(String targetSubType) {
this.targetSubType = targetSubType;
}
public List<String> getTargetPid() {
return targetPid;
}
public void setTargetPid(List<String> targetPid) {
this.targetPid = targetPid;
}
public List<String> getTargetPidType() {
return targetPidType;
}
public void setTargetPidType(List<String> targetPidType) {
this.targetPidType = targetPidType;
}
public List<String> getTargetPublisher() {
return targetPublisher;
}
public void setTargetPublisher(List<String> targetPublisher) {
this.targetPublisher = targetPublisher;
}
public List<String> getLinkProviders() {
return linkProviders;
}
public void setLinkProviders(List<String> linkProviders) {
this.linkProviders = linkProviders;
}
public String getPublicationDate() {
return publicationDate;
}
public void setPublicationDate(String publicationDate) {
this.publicationDate = publicationDate;
}
public String getBlob() {
return blob;
}
public void setBlob(String blob) {
this.blob = blob;
}
}

View File

@ -1,6 +1,14 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Dataset, OtherResearchProduct, Publication, Relation, Result, Software, StructuredProperty}
import eu.dnetlib.dhp.schema.oaf.{
Dataset,
OtherResearchProduct,
Publication,
Relation,
Result,
Software,
StructuredProperty
}
import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{AuthorPid, CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.utils.DHPUtils
@ -263,14 +271,16 @@ object ScholixUtils extends Serializable {
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => {
summaryObject.getAuthor.asScala
.map(a => {
if (a.getORCID != null)
new ScholixEntityId(
a.getFullname,
List(new ScholixIdentifier(a.getORCID, "ORCID", s"https://orcid.org/${a.getORCID}")).asJava
)
else new ScholixEntityId(a.getFullname, null)
}).toList
})
.toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}

View File

@ -11,6 +11,7 @@ import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
import eu.dnetlib.dhp.schema.sx.scholix.ScholixFlat;
public class ScholixFlatTest {

View File

@ -0,0 +1,91 @@
package eu.dnetlib.dhp.sx.graph.dump;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
/**
 * Command-line job that prepares a Zenodo deposition and walks the files stored under an
 * HDFS folder.
 *
 * <p>The accepted arguments are declared in
 * {@code /eu/dnetlib/dhp/sx/graph/dump/upload_zenodo.json}. Depending on
 * {@code depositionType} the job opens a brand new deposition, a new version of a published
 * one, or re-uses an open (unpublished) deposition; it then iterates recursively over
 * {@code hdfsPath}, skipping {@code _SUCCESS} markers, and finally sends the metadata when
 * some has been provided.
 */
public class SendToZenodoHDFS implements Serializable {

    private static final String NEW = "new"; // to be used for a brand new deposition in zenodo
    private static final String VERSION = "version"; // to be used to upload a new version of a published deposition
    private static final String UPDATE = "update"; // to upload content to an open deposition not published

    /**
     * Entry point of the job.
     *
     * @param args command-line arguments, parsed against {@code upload_zenodo.json}
     * @throws MissingConceptDoiException when {@code depositionType} is {@code version} without a
     *         {@code conceptRecordId}, or {@code update} without a {@code depositionId}
     * @throws RuntimeException when {@code depositionType} is none of new / version / update
     * @throws Exception on any argument-parsing, HDFS or Zenodo client failure
     */
    public static void main(final String[] args) throws Exception, MissingConceptDoiException {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SendToZenodoHDFS.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/sx/graph/dump/upload_zenodo.json")));

        parser.parseArgument(args);

        final String hdfsPath = parser.get("hdfsPath");
        final String hdfsNameNode = parser.get("nameNode");
        final String access_token = parser.get("accessToken");
        final String connection_url = parser.get("connectionUrl");
        final String metadata = parser.get("metadata");
        final String depositionType = parser.get("depositionType");
        // Optional arguments: they are simply null when not supplied (the former
        // Optional.ofNullable(..).orElse(null) wrapper was a no-op).
        final String concept_rec_id = parser.get("conceptRecordId");
        final String depositionId = parser.get("depositionId");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsNameNode);

        FileSystem fileSystem = FileSystem.get(conf);

        // recursive listing of every file below hdfsPath
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
            .listFiles(
                new Path(hdfsPath), true);
        ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
        switch (depositionType) {
            case NEW:
                zenodoApiClient.newDeposition();
                break;
            case VERSION:
                if (concept_rec_id == null) {
                    throw new MissingConceptDoiException("No concept record id has been provided");
                }
                zenodoApiClient.newVersion(concept_rec_id);
                break;
            case UPDATE:
                if (depositionId == null) {
                    throw new MissingConceptDoiException("No deposition id has been provided");
                }
                zenodoApiClient.uploadOpenDeposition(depositionId);
                break;
            default:
                throw new RuntimeException("No available entries");
        }

        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();

            Path p = fileStatus.getPath();
            String pString = p.toString();
            if (!pString.endsWith("_SUCCESS")) {
                // last path segment, i.e. the name the file would get in the deposition
                String name = pString.substring(pString.lastIndexOf("/") + 1);

                // try-with-resources: the stream was previously opened and never closed,
                // leaking one HDFS stream per listed file.
                try (FSDataInputStream inputStream = fileSystem.open(p)) {
                    // NOTE(review): the actual upload is disabled, so nothing is sent to Zenodo
                    // for this file. Re-enable once the client call is confirmed:
                    // zenodoApiClient.uploadIS(inputStream, name);
                }
            }

        }
        // metadata is an optional argument: guard against null before testing for emptiness
        if (metadata != null && !metadata.isEmpty()) {
            zenodoApiClient.sendMretadata(metadata);
        }

    }

}

View File

@ -0,0 +1,50 @@
[
{
"paramName": "dt",
"paramLongName": "depositionType",
"paramDescription": "the type of the deposition (new, version, update)",
"paramRequired": true
},
{
"paramName": "cri",
"paramLongName": "conceptRecordId",
"paramDescription": "The id of the concept record for a new version",
"paramRequired": false
},
{
"paramName": "di",
"paramLongName": "depositionId",
"paramDescription": "the id of an open deposition which has not been published",
"paramRequired": false
},
{
"paramName": "hdfsp",
"paramLongName": "hdfsPath",
"paramDescription": "the path of the folder to find files to send to Zenodo",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "nameNode",
"paramDescription": "the name node",
"paramRequired": true
},
{
"paramName": "at",
"paramLongName": "accessToken",
"paramDescription": "the access token for the deposition",
"paramRequired": false
},
{
"paramName": "cu",
"paramLongName": "connectionUrl",
"paramDescription": "the url to connect to deposit",
"paramRequired": false
},
{
"paramName": "m",
"paramLongName": "metadata",
"paramDescription": "metadata associated to the deposition",
"paramRequired": false
}
]

View File

@ -8,148 +8,36 @@
<name>targetPath</name>
<description>the final graph path</description>
</property>
<property>
<name>relationFilter</name>
<description>Filter relation semantic</description>
</property>
<property>
<name>maxNumberOfPid</name>
<description>filter relation with at least #maxNumberOfPid</description>
</property>
<property>
<name>dumpCitations</name>
<value>false</value>
<description>should dump citation relations</description>
</property>
</parameters>
<start to="ImportDatasetEntities"/>
<start to="make_tar"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportDatasetEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Import JSONRDD to Dataset kryo</name>
<class>eu.dnetlib.dhp.sx.graph.SparkConvertRDDtoDataset</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=3000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--filterRelation</arg><arg>${relationFilter}</arg>
</spark>
<ok to="CreateSummaries"/>
<error to="Kill"/>
</action>
<action name="CreateSummaries">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Convert Entities to summaries</name>
<class>eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=20000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--sourcePath</arg><arg>${targetPath}/entities</arg>
<arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
</spark>
<ok to="CreateScholix"/>
<error to="Kill"/>
</action>
<action name="CreateScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Generate Scholix Dataset</name>
<class>eu.dnetlib.dhp.sx.graph.SparkCreateScholix</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=30000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
<arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
<arg>--relationPath</arg><arg>${targetPath}/relation</arg>
<arg>--dumpCitations</arg><arg>${dumpCitations}</arg>
</spark>
<ok to="DropJSONPath"/>
<error to="Kill"/>
</action>
<action name="DropJSONPath">
<fs>
<delete path='${targetPath}/json'/>
<mkdir path='${targetPath}/json/'/>
</fs>
<ok to="SerializeScholix"/>
<error to="Kill"/>
</action>
<action name="SerializeScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Serialize scholix to JSON</name>
<class>eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=6000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--sourcePath</arg><arg>${targetPath}/provision/scholix/scholix</arg>
<arg>--targetPath</arg><arg>${targetPath}/json/scholix_json</arg>
<arg>--objectType</arg><arg>scholix</arg>
<arg>--maxPidNumberFilter</arg><arg>maxNumberOfPid</arg>
</spark>
<ok to="make_tar"/>
<error to="Kill"/>
</action>
<action name="make_tar">
<java>
<main-class>eu.dnetlib.dhp.common.MakeTarArchive</main-class>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--hdfsPath</arg><arg>${targetPath}</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
</java>
<ok to="send_zenodo"/>
<error to="Kill"/>
</action>
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.sx.graph.dump.SendToZenodoHDFS</main-class>
<arg>--hdfsPath</arg><arg>${targetPath}/tar</arg>
<arg>--sourcePath</arg><arg>${targetPath}/json</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--accessToken</arg><arg>${accessToken}</arg>
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
<arg>--metadata</arg><arg>${metadata}</arg>
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
<arg>--depositionType</arg><arg>${depositionType}</arg>
<arg>--depositionId</arg><arg>${depositionId}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>

View File

@ -103,7 +103,7 @@ public class DropAndCreateESIndex {
Objects
.requireNonNull(
DropAndCreateESIndex.class
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/scholix_index.json")));
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/scholix_index_flat.json")));
log.info("creating Index SCHOLIX");
final HttpPut put = new HttpPut(String.format(url, ip, index, "scholix"));

View File

@ -42,7 +42,8 @@ public class SparkIndexCollectionOnES {
.toString(
Objects
.requireNonNull(
DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/cluster.json")));
SparkIndexCollectionOnES.class
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/cluster.json")));
@SuppressWarnings("unchecked")
final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);

View File

@ -14,7 +14,7 @@
</property>
</parameters>
<start to="DropAndCreateIndex"/>
<start to="indexScholix"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>

View File

@ -0,0 +1,77 @@
{
"mappings": {
"properties": {
"blob": {
"type": "binary",
"index": false
},
"identifier": {
"type": "keyword"
},
"linkProviders": {
"type": "keyword"
},
"publicationDate": {
"type": "date"
},
"relationType": {
"type": "keyword"
},
"sourceId": {
"type": "keyword"
},
"sourcePid": {
"type": "keyword"
},
"sourcePidType": {
"type": "keyword"
},
"sourcePublisher": {
"type": "keyword"
},
"sourceSubType": {
"type": "keyword"
},
"sourceType": {
"type": "keyword"
},
"targetId": {
"type": "keyword"
},
"targetPid": {
"type": "keyword"
},
"targetPidType": {
"type": "keyword"
},
"targetPublisher": {
"type": "keyword"
},
"targetSubType": {
"type": "keyword"
},
"targetType": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

View File

@ -0,0 +1,146 @@
package eu.dnetlib.dhp.sx;
import static eu.dnetlib.dhp.sx.provision.DropAndCreateESIndex.APPLICATION_JSON;
import static eu.dnetlib.dhp.sx.provision.DropAndCreateESIndex.STATUS_CODE_TEXT;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.codec.binary.Base64InputStream;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
import eu.dnetlib.dhp.schema.sx.scholix.ScholixFlat;
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils;
import eu.dnetlib.dhp.sx.provision.DropAndCreateESIndex;
import eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnES;
/**
 * Manual tests for the flat Scholix index. Both methods talk to an Elasticsearch instance
 * on {@code localhost}.
 *
 * <p>NOTE(review): {@link #dropAndCreateIndex()} recreates the index {@code dli_scholix} while
 * {@link #testFeedIndex()} writes to the resource {@code scholix} - confirm which name is intended.
 */
public class FlatIndexTest {

    private static final Logger log = LoggerFactory.getLogger(FlatIndexTest.class);

    /** URL of the index dropped and recreated by {@link #dropAndCreateIndex()}. */
    private static final String INDEX_URL = "http://localhost:9200/dli_scholix";

    /** Local file where the flattened records are written before being indexed. */
    private static final String FLAT_DUMP_PATH = "/tmp/scholix_flat.gz";

    /**
     * Deletes the index and recreates it with the mapping stored in
     * {@code scholix_index_flat.json}, logging the status line of both HTTP calls.
     */
    @Test
    public void dropAndCreateIndex() {
        try (CloseableHttpClient client = HttpClients.createDefault()) {

            // responses are closed explicitly so their connections are released
            try (CloseableHttpResponse response = client.execute(new HttpDelete(INDEX_URL))) {
                log.info("deleting Index SCHOLIX");
                log.info(STATUS_CODE_TEXT, response.getStatusLine());
            }

            final String scholixConf;
            try (InputStream mapping = Objects
                .requireNonNull(
                    DropAndCreateESIndex.class
                        .getResourceAsStream("/eu/dnetlib/dhp/sx/provision/scholix_index_flat.json"))) {
                scholixConf = IOUtils.toString(mapping, StandardCharsets.UTF_8);
            }

            log.info("creating Index SCHOLIX");
            final HttpPut put = new HttpPut(INDEX_URL);
            put.setEntity(new StringEntity(scholixConf));
            put.setHeader("Accept", APPLICATION_JSON);
            put.setHeader("Content-type", APPLICATION_JSON);

            try (CloseableHttpResponse response = client.execute(put)) {
                log.info(STATUS_CODE_TEXT, response.getStatusLine());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Gzips the UTF-8 bytes of {@code input} and returns the result Base64-encoded.
     *
     * @param input the text to compress
     * @return the Base64 text of the gzipped input
     * @throws UncheckedIOException if the compression fails (previously the error was printed
     *         and {@code null} returned, which silently produced records without payload)
     */
    private String compressString(final String input) {
        final ByteArrayOutputStream os = new ByteArrayOutputStream();
        // closing the gzip stream also closes the Base64 stream, which writes the final padding;
        // the buffer must therefore be read only after the try block
        try (GZIPOutputStream gzip = new GZIPOutputStream(new Base64OutputStream(os))) {
            gzip.write(input.getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            throw new UncheckedIOException("unable to compress the input string", e);
        }
        return new String(os.toByteArray(), StandardCharsets.UTF_8);
    }

    /**
     * Inverse of {@link #compressString(String)}: Base64-decodes and gunzips the given text.
     *
     * @param compressed Base64 text of gzipped UTF-8 content
     * @return the original text
     * @throws Exception if the input cannot be decoded or decompressed
     */
    private String uncompress(final String compressed) throws Exception {
        try (GZIPInputStream gzip = new GZIPInputStream(
            new Base64InputStream(
                new ByteArrayInputStream(compressed.getBytes(StandardCharsets.UTF_8))))) {
            return IOUtils.toString(gzip, StandardCharsets.UTF_8);
        }
    }

    /**
     * Reads the bundled {@code scholix_dump.gz}, flattens each record into a
     * {@link ScholixFlat} carrying the compressed original line, writes the result as gzipped
     * JSON lines to a local file and indexes that file through a local Spark context.
     *
     * <p>NOTE(review): this method carries no {@code @Test} annotation, so JUnit does not run it;
     * left as is in case that is deliberate.
     *
     * @throws Exception on any I/O, JSON or indexing failure
     */
    public void testFeedIndex() throws Exception {
        final ObjectMapper mapper = new ObjectMapper();

        try (
            BufferedReader buffered = new BufferedReader(
                new InputStreamReader(
                    new GZIPInputStream(
                        Objects
                            .requireNonNull(
                                getClass().getResourceAsStream("/eu/dnetlib/dhp/sx/provision/scholix_dump.gz"))),
                    StandardCharsets.UTF_8));
            GZIPOutputStream gzip = new GZIPOutputStream(Files.newOutputStream(Paths.get(FLAT_DUMP_PATH)))) {

            String line;
            while ((line = buffered.readLine()) != null) {
                final Scholix s = mapper.readValue(line, Scholix.class);
                final ScholixFlat flat = ScholixUtils.flattenizeScholix(s, compressString(line));
                gzip.write(mapper.writeValueAsString(flat).concat("\n").getBytes(StandardCharsets.UTF_8));
            }
        }

        SparkConf conf = new SparkConf()
            .setAppName(SparkIndexCollectionOnES.class.getSimpleName())
            .setMaster("local[*]");
        final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

        try (final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {

            JavaRDD<String> inputRdd = sc.textFile(FLAT_DUMP_PATH);

            Map<String, String> esCfg = new HashMap<>();
            esCfg.put("es.nodes", "localhost");
            esCfg.put("es.mapping.id", "identifier");
            esCfg.put("es.batch.write.retry.count", "8");
            esCfg.put("es.batch.write.retry.wait", "60s");
            esCfg.put("es.batch.size.entries", "200");
            esCfg.put("es.nodes.wan.only", "true");
            JavaEsSpark.saveJsonToEs(inputRdd, "scholix", esCfg);
        }
    }
}