[Organization] dump for organizations only
This commit is contained in:
parent
3ad0d6edfc
commit
e4b56a4f88
|
@ -1,6 +1,5 @@
|
|||
import java.io.IOException;
|
||||
|
||||
import eu.dnetlib.dhp.oa.model.Result;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
|
@ -10,6 +9,7 @@ import com.github.imifou.jsonschema.module.addon.AddonModule;
|
|||
import com.github.victools.jsonschema.generator.*;
|
||||
|
||||
import eu.dnetlib.dhp.ExecCreateSchemas;
|
||||
import eu.dnetlib.dhp.oa.model.Result;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.oa.model.graph.*;
|
||||
|
||||
|
|
|
@ -9,6 +9,8 @@ import java.util.Objects;
|
|||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.amazonaws.transform.SimpleTypeUnmarshallers;
|
||||
import io.netty.util.internal.StringUtil;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
|
@ -95,6 +97,8 @@ public class SparkDumpFunderResults implements Serializable {
|
|||
Optional<Funder> ofunder = Optional.ofNullable(p.getFunder());
|
||||
if (ofunder.isPresent()) {
|
||||
String fName = ofunder.get().getShortName();
|
||||
if(StringUtil.isNullOrEmpty(fName))
|
||||
return ofunder.get().getName();
|
||||
if (fName.equalsIgnoreCase("ec")) {
|
||||
fName += "_" + ofunder.get().getFundingStream();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,270 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.organizationonly;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
|
||||
import eu.dnetlib.dhp.oa.model.Container;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.Result;
|
||||
import eu.dnetlib.dhp.oa.model.graph.*;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Datasource;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Organization;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Project;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Spark Job that fires the dump for the entites
|
||||
*/
|
||||
public class SparkDumpOrganizationJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob.class);
|
||||
public static final String COMPRESSION = "compression";
|
||||
public static final String GZIP = "gzip";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
Boolean isSparkSessionManaged = Boolean.TRUE;
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = "/tmp/miriam/organizationsOnly/";
|
||||
log.info("outputPath: {}", outputPath);
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
// Utils.removeOutputDir(spark, outputPath);
|
||||
organizationMap(spark, inputPath, outputPath);
|
||||
// relationMap2(spark, inputPath, outputPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "relation", Relation.class)
|
||||
.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
|
||||
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
|
||||
relNew
|
||||
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setReltype(
|
||||
RelType
|
||||
.newInstance(
|
||||
relation.getRelClass(),
|
||||
relation.getSubRelType()));
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
DataInfo dInfo = odInfo.get();
|
||||
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
|
||||
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
|
||||
relNew
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
dInfo.getProvenanceaction().getClassname(),
|
||||
dInfo.getTrust()));
|
||||
}
|
||||
}
|
||||
if (Boolean.TRUE.equals(relation.getValidated())) {
|
||||
relNew.setValidated(relation.getValidated());
|
||||
relNew.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
|
||||
return relNew;
|
||||
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "relation");
|
||||
}
|
||||
|
||||
private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
|
||||
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
|
||||
Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
|
||||
organization
|
||||
.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
|
||||
.map(
|
||||
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json("/tmp/orgSource");
|
||||
|
||||
rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
|
||||
|
||||
organization
|
||||
.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
|
||||
.map(
|
||||
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json("/tmp/orgSourceTarget");
|
||||
|
||||
Utils
|
||||
.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
|
||||
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
|
||||
relNew
|
||||
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setReltype(
|
||||
RelType
|
||||
.newInstance(
|
||||
relation.getRelClass(),
|
||||
relation.getSubRelType()));
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
DataInfo dInfo = odInfo.get();
|
||||
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
|
||||
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
|
||||
relNew
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
dInfo.getProvenanceaction().getClassname(),
|
||||
dInfo.getTrust()));
|
||||
}
|
||||
}
|
||||
if (Boolean.TRUE.equals(relation.getValidated())) {
|
||||
relNew.setValidated(relation.getValidated());
|
||||
relNew.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
|
||||
return relNew;
|
||||
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "relation");
|
||||
}
|
||||
|
||||
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
|
||||
.map(
|
||||
(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
|
||||
Encoders.bean(Organization.class))
|
||||
.filter((FilterFunction<Organization>) o -> o != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.json(outputPath + "/organization");
|
||||
}
|
||||
|
||||
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
|
||||
eu.dnetlib.dhp.schema.oaf.Organization org) {
|
||||
|
||||
Organization organization = new Organization();
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalshortname())
|
||||
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalname())
|
||||
.ifPresent(value -> organization.setLegalname(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getWebsiteurl())
|
||||
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getAlternativeNames())
|
||||
.ifPresent(
|
||||
value -> organization
|
||||
.setAlternativenames(
|
||||
value
|
||||
.stream()
|
||||
.map(v -> v.getValue())
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getCountry())
|
||||
.ifPresent(
|
||||
value -> {
|
||||
if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
|
||||
organization
|
||||
.setCountry(
|
||||
eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getId())
|
||||
.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getPid())
|
||||
.ifPresent(
|
||||
value -> organization
|
||||
.setPid(
|
||||
value
|
||||
.stream()
|
||||
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
return organization;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,88 @@
|
|||
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
<start to="dump_organization"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="dump_organization">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table organization and related relations </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob</class>
|
||||
<jar>dump-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/project</arg>
|
||||
<arg>--communityMapPath</arg><arg>noneed</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue