[Organization] dump for organizations only

This commit is contained in:
Miriam Baglioni 2024-02-06 14:13:12 +02:00
parent 3ad0d6edfc
commit e4b56a4f88
5 changed files with 393 additions and 1 deletions

View File

@ -1,6 +1,5 @@
import java.io.IOException;
import eu.dnetlib.dhp.oa.model.Result;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
@ -10,6 +9,7 @@ import com.github.imifou.jsonschema.module.addon.AddonModule;
import com.github.victools.jsonschema.generator.*;
import eu.dnetlib.dhp.ExecCreateSchemas;
import eu.dnetlib.dhp.oa.model.Result;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.oa.model.graph.*;

View File

@ -9,6 +9,8 @@ import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import com.amazonaws.transform.SimpleTypeUnmarshallers;
import io.netty.util.internal.StringUtil;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -95,6 +97,8 @@ public class SparkDumpFunderResults implements Serializable {
Optional<Funder> ofunder = Optional.ofNullable(p.getFunder());
if (ofunder.isPresent()) {
String fName = ofunder.get().getShortName();
if(StringUtil.isNullOrEmpty(fName))
return ofunder.get().getName();
if (fName.equalsIgnoreCase("ec")) {
fName += "_" + ofunder.get().getFundingStream();
}

View File

@ -0,0 +1,270 @@
package eu.dnetlib.dhp.oa.graph.dump.organizationonly;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.Container;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.Result;
import eu.dnetlib.dhp.oa.model.graph.*;
import eu.dnetlib.dhp.oa.model.graph.Datasource;
import eu.dnetlib.dhp.oa.model.graph.Organization;
import eu.dnetlib.dhp.oa.model.graph.Project;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
* Spark Job that fires the dump for the entites
*/
public class SparkDumpOrganizationJob implements Serializable {
private static final Logger log = LoggerFactory
.getLogger(eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob.class);
public static final String COMPRESSION = "compression";
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
Boolean isSparkSessionManaged = Boolean.TRUE;
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
log.info("inputPath: {}", inputPath);
final String outputPath = "/tmp/miriam/organizationsOnly/";
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath);
// relationMap2(spark, inputPath, outputPath);
});
}
private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "relation", Relation.class)
.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
relNew
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
relNew
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
relNew
.setReltype(
RelType
.newInstance(
relation.getRelClass(),
relation.getSubRelType()));
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
if (odInfo.isPresent()) {
DataInfo dInfo = odInfo.get();
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
relNew
.setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
if (Boolean.TRUE.equals(relation.getValidated())) {
relNew.setValidated(relation.getValidated());
relNew.setValidationDate(relation.getValidationDate());
}
return relNew;
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "relation");
}
private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
organization
.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
.map(
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
Encoders.bean(Relation.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json("/tmp/orgSource");
rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
organization
.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
.map(
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
Encoders.bean(Relation.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json("/tmp/orgSourceTarget");
Utils
.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
relNew
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
relNew
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
relNew
.setReltype(
RelType
.newInstance(
relation.getRelClass(),
relation.getSubRelType()));
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
if (odInfo.isPresent()) {
DataInfo dInfo = odInfo.get();
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
relNew
.setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
if (Boolean.TRUE.equals(relation.getValidated())) {
relNew.setValidated(relation.getValidated());
relNew.setValidationDate(relation.getValidationDate());
}
return relNew;
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "relation");
}
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
.map(
(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
Encoders.bean(Organization.class))
.filter((FilterFunction<Organization>) o -> o != null)
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.json(outputPath + "/organization");
}
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
eu.dnetlib.dhp.schema.oaf.Organization org) {
Organization organization = new Organization();
Optional
.ofNullable(org.getLegalshortname())
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
Optional
.ofNullable(org.getLegalname())
.ifPresent(value -> organization.setLegalname(value.getValue()));
Optional
.ofNullable(org.getWebsiteurl())
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
Optional
.ofNullable(org.getAlternativeNames())
.ifPresent(
value -> organization
.setAlternativenames(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(org.getCountry())
.ifPresent(
value -> {
if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
organization
.setCountry(
eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
}
});
Optional
.ofNullable(org.getId())
.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
Optional
.ofNullable(org.getPid())
.ifPresent(
value -> organization
.setPid(
value
.stream()
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
return organization;
}
}

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,88 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="dump_organization"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="dump_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table organization and related relations </name>
<class>eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${workingDir}/project</arg>
<arg>--communityMapPath</arg><arg>noneed</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>