diff --git a/dhp-doc-resources/img/data_provision_workflow.png b/dhp-doc-resources/img/data_provision_workflow.png new file mode 100644 index 0000000000..31979fbb49 Binary files /dev/null and b/dhp-doc-resources/img/data_provision_workflow.png differ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index d8873d33d0..89e52858be 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -30,6 +30,12 @@ com.fasterxml.jackson.core jackson-databind + + + junit + junit + ${junit.version} + eu.dnetlib.dhp diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 5cf0883beb..24a363bec2 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,66 +1,83 @@ package eu.dnetlib.dhp.schema.oaf; +import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.junit.Assert; public class Relation extends Oaf { - private String relType; + private String relType; - private String subRelType; + private String subRelType; - private String relClass; + private String relClass; - private String source; + private String source; - private String target; + private String target; - private List collectedFrom; + private List collectedFrom = new ArrayList<>(); - public String getRelType() { - return relType; - } + public String getRelType() { + return relType; + } - public void setRelType(String relType) { - this.relType = relType; - } + public void setRelType(final String relType) { + this.relType = relType; + } - public String getSubRelType() { - return subRelType; - } + public String getSubRelType() { + return subRelType; + } - public void setSubRelType(String subRelType) { - this.subRelType = subRelType; - } + public void setSubRelType(final String subRelType) { + this.subRelType = subRelType; + } - public String getRelClass() { - return relClass; - } + public String getRelClass() { + return relClass; + } - public void setRelClass(String relClass) { - this.relClass = relClass; - } + public void setRelClass(final String relClass) { + this.relClass = relClass; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - public void setSource(String source) { - this.source = source; - } + public void setSource(final String source) { + this.source = source; + } - public String getTarget() { - return target; - } + public String getTarget() { + return target; + } - public void setTarget(String target) { - this.target = target; - } + public void setTarget(final String target) { + this.target = target; + } - public List getCollectedFrom() { - return collectedFrom; - } + public List getCollectedFrom() { + return collectedFrom; + } + + public void setCollectedFrom(final List collectedFrom) { + this.collectedFrom = collectedFrom; + } + + public void mergeFrom(final Relation r) { + Assert.assertEquals("source ids must be equal", getSource(), r.getSource()); + Assert.assertEquals("target ids must be equal", getTarget(), r.getTarget()); + Assert.assertEquals("relType(s) must be equal", getRelType(), r.getRelType()); + Assert.assertEquals("subRelType(s) must be equal", getSubRelType(), r.getSubRelType()); + Assert.assertEquals("relClass(es) must be equal", getRelClass(), r.getRelClass()); + setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream()) + .distinct() // relies on 
KeyValue.equals + .collect(Collectors.toList())); + } - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 5cb04da5eb..b6c73e84a6 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -4,7 +4,7 @@ import java.io.Serializable; import java.util.Comparator; import java.util.List; -public abstract class Result extends OafEntity implements Serializable { +public class Result extends OafEntity implements Serializable { private List author; diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 78831073fd..09dac83499 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -24,6 +24,12 @@ eu.dnetlib.dhp dhp-common ${project.version} + + + com.sun.xml.bind + jaxb-core + + @@ -32,6 +38,49 @@ ${project.version} + + eu.dnetlib + dnet-actionmanager-common + + + eu.dnetlib + dnet-openaireplus-mapping-utils + + + saxonica + saxon + + + saxonica + saxon-dom + + + jgrapht + jgrapht + + + net.sf.ehcache + ehcache + + + org.springframework + spring-test + + + org.apache.* + * + + + apache + * + + + + + eu.dnetlib + dnet-openaire-data-protos + + net.sf.saxon Saxon-HE @@ -55,6 +104,11 @@ org.mongodb mongo-java-driver + + + org.apache.hadoop + hadoop-distcp + org.postgresql diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java deleted file mode 100644 index f2d9caebf6..0000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java +++ /dev/null @@ -1,56 +0,0 @@ -package eu.dnetlib.dhp.migration; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.io.Text; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.util.Arrays; -import java.util.List; - -public class ExtractEntitiesFromHDFSJob { - - - private static List folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities"); - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json"))); - parser.parseArgument(args); - - final SparkSession spark = SparkSession - .builder() - .appName(ExtractEntitiesFromHDFSJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final String sourcePath = parser.get("sourcePath"); - final String targetPath = parser.get("graphRawPath"); - final String entity = parser.get("entity"); - - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - - JavaRDD inputRdd = sc.emptyRDD(); - - - folderNames.forEach(p -> inputRdd.union( - sc.sequenceFile(sourcePath+"/"+p, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .filter(k -> isEntityType(k._1(), entity)) - .map(Tuple2::_2)) - ); - - 
inputRdd.saveAsTextFile(targetPath+"/"+entity);
-    }
-
-
-    private static boolean isEntityType(final String item, final String entity) {
-        return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
-    }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java
deleted file mode 100644
index 359fe7596b..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package eu.dnetlib.dhp.migration;
-
-import org.apache.commons.io.IOUtils;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-
-public class MigrateMongoMdstoresApplication {
-
-    public static void main(final String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-                IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
-        parser.parseArgument(args);
-
-        final String mongoBaseUrl = parser.get("mongoBaseUrl");
-        final String mongoDb = parser.get("mongoDb");
-
-        final String mdFormat = parser.get("mdFormat");
-        final String mdLayout = parser.get("mdLayout");
-        final String mdInterpretation = parser.get("mdInterpretation");
-
-        final String hdfsPath = parser.get("hdfsPath");
-        final String hdfsNameNode = parser.get("namenode");
-        final String hdfsUser = parser.get("hdfsUser");
-
-        final String dbUrl = parser.get("postgresUrl");
-        final String dbUser = parser.get("postgresUser");
-        final String dbPassword = parser.get("postgresPassword");
-
-        if (mdFormat.equalsIgnoreCase("oaf")) {
-            try (final OafMigrationExecutor mig =
-                    new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
-                mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
-            }
-        } else if (mdFormat.equalsIgnoreCase("odf")) {
-            try (final OdfMigrationExecutor mig =
-                    new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
-                mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
-            }
-        } else {
-            throw new RuntimeException("Format not supported: " + mdFormat);
-        }
-
-    }
-
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java
new file mode 100644
index 0000000000..9d0e82aca9
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java
@@ -0,0 +1,49 @@
+package eu.dnetlib.dhp.migration.actions;
+
+import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+
+import java.util.Comparator;
+
+public class LicenseComparator implements Comparator<Qualifier> {
+
+    @Override
+    public int compare(Qualifier left, Qualifier right) {
+
+        if (left == null && right == null) return 0;
+        if (left == null) return 1;
+        if (right == null) return -1;
+
+        String lClass = left.getClassid();
+        String rClass = right.getClassid();
+
+        if (lClass.equals(rClass)) return 0;
+
+        if (lClass.equals("OPEN SOURCE")) return -1;
+        if (rClass.equals("OPEN SOURCE")) return 1;
+
+        if (lClass.equals("OPEN")) return -1;
+        if (rClass.equals("OPEN")) return 1;
+
+        if (lClass.equals("6MONTHS")) return -1;
+        if (rClass.equals("6MONTHS")) return 1;
+
+        if (lClass.equals("12MONTHS")) return -1;
+        if (rClass.equals("12MONTHS")) return 1;
+
+        if (lClass.equals("EMBARGO")) return -1;
+        if (rClass.equals("EMBARGO")) return 1;
+
+        if (lClass.equals("RESTRICTED")) return -1;
+        if (rClass.equals("RESTRICTED")) return 1;
+
+        if (lClass.equals("CLOSED")) return -1;
+        if (rClass.equals("CLOSED")) return 1;
+
+        if (lClass.equals("UNKNOWN")) return -1;
+        if (rClass.equals("UNKNOWN")) return 1;
+
+        // Else (but unlikely), lexicographical ordering will do.
+        return lClass.compareTo(rClass);
+    }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java
new file mode 100644
index 0000000000..487fac359c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java
@@ -0,0 +1,170 @@
+package eu.dnetlib.dhp.migration.actions;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.DistCp;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.util.ToolRunner;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class MigrateActionSet {
+
+    private static final Log log = LogFactory.getLog(MigrateActionSet.class);
+
+    private static final String SEPARATOR = "/";
+    private static final String TARGET_PATHS = "target_paths";
+    private static final String RAWSET_PREFIX = "rawset_";
+
+    private static Boolean DEFAULT_TRANSFORM_ONLY = false;
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
+                        "/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
+        parser.parseArgument(args);
+
+        new MigrateActionSet().run(parser);
+    }
+
+    private void run(ArgumentApplicationParser parser) throws Exception {
+
+        final String isLookupUrl = parser.get("isLookupUrl");
+        final String sourceNN = parser.get("sourceNameNode");
+        final String targetNN = parser.get("targetNameNode");
+        final String workDir = parser.get("workingDirectory");
+        final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
+
+        final String distcp_memory_mb = parser.get("distcp_memory_mb");
+        final String distcp_task_timeout = parser.get("distcp_task_timeout");
+
+        final String transform_only_s = parser.get("transform_only");
+
+        log.info("transform only param: " + transform_only_s);
+
+        final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
+
+        log.info("transform only: " + transformOnly);
+
+        ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+
+        Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
+        FileSystem targetFS =
FileSystem.get(conf); + + Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); + FileSystem sourceFS = FileSystem.get(sourceConf); + + Properties props = new Properties(); + + List targetPaths = new ArrayList<>(); + + final List sourcePaths = getSourcePaths(sourceNN, isLookUp); + log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n")))); + for(Path source : sourcePaths) { + + if (!sourceFS.exists(source)) { + log.warn(String.format("skipping unexisting path: %s", source)); + } else { + + LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); + + final String rawSet = pathQ.pollLast(); + log.info(String.format("got RAWSET: %s", rawSet)); + + if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) { + + final String actionSetDirectory = pathQ.pollLast(); + + final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); + + log.info(String.format("using TARGET PATH: %s", targetPath)); + + if (!transformOnly) { + if (targetFS.exists(targetPath)) { + targetFS.delete(targetPath, true); + } + runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); + } + + targetPaths.add(targetPath); + } + } + } + + props.setProperty(TARGET_PATHS, targetPaths + .stream() + .map(p -> p.toString()) + .collect(Collectors.joining(","))); + File file = new File(System.getProperty("oozie.action.output.properties")); + + try(OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + System.out.println(file.getAbsolutePath()); + } + + private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception { + + final DistCpOptions op = new DistCpOptions(source, targetPath); + op.setMaxMaps(distcp_num_maps); + op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE); + op.preserve(DistCpOptions.FileAttribute.REPLICATION); + op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); + + int res = ToolRunner.run(new DistCp(conf, op), new String[]{ + "-Dmapred.task.timeout=" + distcp_task_timeout, + "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, + "-pb", + "-m " + distcp_num_maps, + source.toString(), + targetPath.toString()}); + + if (res != 0) { + throw new RuntimeException(String.format("distcp exited with code %s", res)); + } + } + + private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { + final Configuration conf = new Configuration(); + conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); + conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); + conf.set("dfs.http.client.retry.policy.enabled", "true"); + conf.set("mapred.task.timeout", distcp_task_timeout); + conf.set("mapreduce.map.memory.mb", distcp_memory_mb); + conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps)); + return conf; + } + + private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException { + String XQUERY = "distinct-values(\n" + + "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" + + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" + + "let $setDir 
:= $x//SET/@directory/string()\n" + + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" + + "return concat($basePath, '/', $setDir, '/', $rawSet))"; + + log.info(String.format("running xquery:\n%s", XQUERY)); + return isLookUp.quickSearchProfile(XQUERY) + .stream() + .map(p -> sourceNN + p) + .map(Path::new) + .collect(Collectors.toList()); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java new file mode 100644 index 0000000000..a7e70ee813 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java @@ -0,0 +1,580 @@ +package eu.dnetlib.dhp.migration.actions; + +import com.google.common.collect.Lists; +import com.googlecode.protobuf.format.JsonFormat; +import eu.dnetlib.data.proto.*; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.lang3.StringUtils; + +import java.io.Serializable; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +public class ProtoConverter implements Serializable { + + public static final String UNKNOWN = "UNKNOWN"; + public static final String NOT_AVAILABLE = "not available"; + public static final String DNET_ACCESS_MODES = "dnet:access_modes"; + + public static Oaf convert(OafProtos.Oaf oaf) { + try { + switch (oaf.getKind()) { + case entity: + return convertEntity(oaf); + case relation: + return convertRelation(oaf); + default: + throw new IllegalArgumentException("invalid kind " + oaf.getKind()); + } + } catch (Throwable e) { + throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); + } + } + + private static Relation convertRelation(OafProtos.Oaf oaf) { + final OafProtos.OafRel r = oaf.getRel(); + final Relation rel = new Relation(); + rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); + rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); + rel.setSource(r.getSource()); + rel.setTarget(r.getTarget()); + rel.setRelType(r.getRelType().toString()); + rel.setSubRelType(r.getSubRelType().toString()); + rel.setRelClass(r.getRelClass()); + rel.setCollectedFrom(r.getCollectedfromCount() > 0 ? 
+ r.getCollectedfromList().stream() + .map(kv -> mapKV(kv)) + .collect(Collectors.toList()) : null); + return rel; + } + + private static OafEntity convertEntity(OafProtos.Oaf oaf) { + + switch (oaf.getEntity().getType()) { + case result: + final Result r = convertResult(oaf); + r.setInstance(convertInstances(oaf)); + return r; + case project: + return convertProject(oaf); + case datasource: + return convertDataSource(oaf); + case organization: + return convertOrganization(oaf); + default: + throw new RuntimeException("received unknown type"); + } + } + + private static List convertInstances(OafProtos.Oaf oaf) { + + final ResultProtos.Result r = oaf.getEntity().getResult(); + if (r.getInstanceCount() > 0) { + return r.getInstanceList() + .stream() + .map(i -> convertInstance(i)) + .collect(Collectors.toList()); + } + return Lists.newArrayList(); + } + + private static Instance convertInstance(ResultProtos.Result.Instance ri) { + final Instance i = new Instance(); + i.setAccessright(mapQualifier(ri.getAccessright())); + i.setCollectedfrom(mapKV(ri.getCollectedfrom())); + i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); + i.setDistributionlocation(ri.getDistributionlocation()); + i.setHostedby(mapKV(ri.getHostedby())); + i.setInstancetype(mapQualifier(ri.getInstancetype())); + i.setLicense(mapStringField(ri.getLicense())); + i.setUrl(ri.getUrlList()); + i.setRefereed(mapStringField(ri.getRefereed())); + i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); + i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); + return i; + } + + private static Organization convertOrganization(OafProtos.Oaf oaf) { + final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata(); + final Organization org = setOaf(new Organization(), oaf); + setEntity(org, oaf); + org.setLegalshortname(mapStringField(m.getLegalshortname())); + org.setLegalname(mapStringField(m.getLegalname())); + org.setAlternativeNames(m.getAlternativeNamesList(). 
+ stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + org.setWebsiteurl(mapStringField(m.getWebsiteurl())); + org.setLogourl(mapStringField(m.getLogourl())); + org.setEclegalbody(mapStringField(m.getEclegalbody())); + org.setEclegalperson(mapStringField(m.getEclegalperson())); + org.setEcnonprofit(mapStringField(m.getEcnonprofit())); + org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); + org.setEchighereducation(mapStringField(m.getEchighereducation())); + org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests())); + org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); + org.setEcenterprise(mapStringField(m.getEcenterprise())); + org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); + org.setEcnutscode(mapStringField(m.getEcnutscode())); + org.setCountry(mapQualifier(m.getCountry())); + + return org; + } + + private static Datasource convertDataSource(OafProtos.Oaf oaf) { + final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); + final Datasource datasource = setOaf(new Datasource(), oaf); + setEntity(datasource, oaf); + datasource.setAccessinfopackage(m.getAccessinfopackageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setCertificates(mapStringField(m.getCertificates())); + datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); + datasource.setContactemail(mapStringField(m.getContactemail())); + datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction())); + datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype())); + datasource.setDataprovider(mapBoolField(m.getDataprovider())); + datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype())); + datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction())); + datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); + datasource.setDatauploadtype(mapStringField(m.getDatauploadtype())); + datasource.setDateofvalidation(mapStringField(m.getDateofvalidation())); + datasource.setDescription(mapStringField(m.getDescription())); + datasource.setEnglishname(mapStringField(m.getEnglishname())); + datasource.setLatitude(mapStringField(m.getLatitude())); + datasource.setLongitude(mapStringField(m.getLongitude())); + datasource.setLogourl(mapStringField(m.getLogourl())); + datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); + datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); + datasource.setOdcontenttypes(m.getOdcontenttypesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setOdlanguages(m.getOdlanguagesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); + datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); + datasource.setOdpolicies(mapStringField(m.getOdpolicies())); + datasource.setOfficialname(mapStringField(m.getOfficialname())); + datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); + datasource.setPidsystems(mapStringField(m.getPidsystems())); + datasource.setPolicies(m.getPoliciesList() + .stream() + .map(ProtoConverter::mapKV) + .collect(Collectors.toList())); + 
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); + datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); + datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); + datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); + datasource.setSubjects(m.getSubjectsList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + datasource.setVersioning(mapBoolField(m.getVersioning())); + datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); + datasource.setJournal(mapJournal(m.getJournal())); + + + return datasource; + } + + private static Project convertProject(OafProtos.Oaf oaf) { + final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); + final Project project = setOaf(new Project(), oaf); + setEntity(project, oaf); + project.setAcronym(mapStringField(m.getAcronym())); + project.setCallidentifier(mapStringField(m.getCallidentifier())); + project.setCode(mapStringField(m.getCode())); + project.setContactemail(mapStringField(m.getContactemail())); + project.setContactfax(mapStringField(m.getContactfax())); + project.setContactfullname(mapStringField(m.getContactfullname())); + project.setContactphone(mapStringField(m.getContactphone())); + project.setContracttype(mapQualifier(m.getContracttype())); + project.setCurrency(mapStringField(m.getCurrency())); + project.setDuration(mapStringField(m.getDuration())); + project.setEcarticle29_3(mapStringField(m.getEcarticle293())); + project.setEcsc39(mapStringField(m.getEcsc39())); + project.setOamandatepublications(mapStringField(m.getOamandatepublications())); + project.setStartdate(mapStringField(m.getStartdate())); + project.setEnddate(mapStringField(m.getEnddate())); + project.setFundedamount(m.getFundedamount()); + project.setTotalcost(m.getTotalcost()); + project.setKeywords(mapStringField(m.getKeywords())); + project.setSubjects(m.getSubjectsList().stream() + .map(sp -> mapStructuredProperty(sp)) + .collect(Collectors.toList())); + project.setTitle(mapStringField(m.getTitle())); + project.setWebsiteurl(mapStringField(m.getWebsiteurl())); + project.setFundingtree(m.getFundingtreeList().stream() + .map(f -> mapStringField(f)) + .collect(Collectors.toList())); + project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); + project.setSummary(mapStringField(m.getSummary())); + project.setOptional1(mapStringField(m.getOptional1())); + project.setOptional2(mapStringField(m.getOptional2())); + return project; + } + + private static Result convertResult(OafProtos.Oaf oaf) { + switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { + case "dataset": + return createDataset(oaf); + case "publication": + return createPublication(oaf); + case "software": + return createSoftware(oaf); + case "other": + return createORP(oaf); + default: + Result result = setOaf(new Result(), oaf); + setEntity(result, oaf); + return setResult(result, oaf); + } + } + + private static Software createSoftware(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Software software = setOaf(new Software(), oaf); + setEntity(software, oaf); + setResult(software, oaf); + + software.setDocumentationUrl(m.getDocumentationUrlList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + software.setLicense(m.getLicenseList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + 
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); + software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); + return software; + } + + private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); + setEntity(otherResearchProducts, oaf); + setResult(otherResearchProducts, oaf); + otherResearchProducts.setContactperson(m.getContactpersonList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts.setContactgroup(m.getContactgroupList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts.setTool(m.getToolList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + + return otherResearchProducts; + } + + private static Publication createPublication(OafProtos.Oaf oaf) { + + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Publication publication = setOaf(new Publication(), oaf); + setEntity(publication, oaf); + setResult(publication, oaf); + publication.setJournal(mapJournal(m.getJournal())); + return publication; + } + + private static Dataset createDataset(OafProtos.Oaf oaf) { + + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Dataset dataset = setOaf(new Dataset(), oaf); + setEntity(dataset, oaf); + setResult(dataset, oaf); + dataset.setStoragedate(mapStringField(m.getStoragedate())); + dataset.setDevice(mapStringField(m.getDevice())); + dataset.setSize(mapStringField(m.getSize())); + dataset.setVersion(mapStringField(m.getVersion())); + dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); + dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); + dataset.setGeolocation(m.getGeolocationList() + .stream() + .map(ProtoConverter::mapGeolocation) + .collect(Collectors.toList())); + return dataset; + + } + + public static T setOaf(T oaf, OafProtos.Oaf o) { + oaf.setDataInfo(mapDataInfo(o.getDataInfo())); + oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); + return oaf; + } + + public static T setEntity(T entity, OafProtos.Oaf oaf) { + //setting Entity fields + final OafProtos.OafEntity e = oaf.getEntity(); + entity.setId(e.getId()); + entity.setOriginalId(e.getOriginalIdList()); + entity.setCollectedfrom(e.getCollectedfromList() + .stream() + .map(ProtoConverter::mapKV) + .collect(Collectors.toList())); + entity.setPid(e.getPidList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDateofcollection(e.getDateofcollection()); + entity.setDateoftransformation(e.getDateoftransformation()); + entity.setExtraInfo(e.getExtraInfoList() + .stream() + .map(ProtoConverter::mapExtraInfo) + .collect(Collectors.toList())); + return entity; + } + + public static T setResult(T entity, OafProtos.Oaf oaf) { + //setting Entity fields + final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + entity.setAuthor(m.getAuthorList() + .stream() + .map(ProtoConverter::mapAuthor) + .collect(Collectors.toList())); + entity.setResulttype(mapQualifier(m.getResulttype())); + entity.setLanguage(mapQualifier(m.getLanguage())); + entity.setCountry(m.getCountryList() + .stream() + .map(ProtoConverter::mapQualifierAsCountry) + .collect(Collectors.toList())); + 
entity.setSubject(m.getSubjectList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setTitle(m.getTitleList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setRelevantdate(m.getRelevantdateList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDescription(m.getDescriptionList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); + entity.setPublisher(mapStringField(m.getPublisher())); + entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); + entity.setSource(m.getSourceList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setFulltext(m.getFulltextList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setFormat(m.getFormatList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setContributor(m.getContributorList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setResourcetype(mapQualifier(m.getResourcetype())); + entity.setCoverage(m.getCoverageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setContext(m.getContextList() + .stream() + .map(ProtoConverter::mapContext) + .collect(Collectors.toList())); + + entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); + + return entity; + } + + private static Qualifier getBestAccessRights(List instanceList) { + if (instanceList != null) { + final Optional min = instanceList.stream() + .map(i -> i.getAccessright()).min(new LicenseComparator()); + + final Qualifier rights = min.isPresent() ? 
mapQualifier(min.get()) : new Qualifier(); + + if (StringUtils.isBlank(rights.getClassid())) { + rights.setClassid(UNKNOWN); + } + if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + rights.setClassname(NOT_AVAILABLE); + } + if (StringUtils.isBlank(rights.getSchemeid())) { + rights.setSchemeid(DNET_ACCESS_MODES); + } + if (StringUtils.isBlank(rights.getSchemename())) { + rights.setSchemename(DNET_ACCESS_MODES); + } + + return rights; + } + return null; + } + + private static Context mapContext(ResultProtos.Result.Context context) { + + final Context entity = new Context(); + entity.setId(context.getId()); + entity.setDataInfo(context.getDataInfoList() + .stream() + .map(ProtoConverter::mapDataInfo) + .collect(Collectors.toList())); + return entity; + } + + + public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + final KeyValue keyValue = new KeyValue(); + keyValue.setKey(kv.getKey()); + keyValue.setValue(kv.getValue()); + keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); + return keyValue; + } + + public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(d.getDeletedbyinference()); + dataInfo.setInferenceprovenance(d.getInferenceprovenance()); + dataInfo.setInferred(d.getInferred()); + dataInfo.setInvisible(d.getInvisible()); + dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); + dataInfo.setTrust(d.getTrust()); + return dataInfo; + } + + public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(q.getClassid()); + qualifier.setClassname(q.getClassname()); + qualifier.setSchemeid(q.getSchemeid()); + qualifier.setSchemename(q.getSchemename()); + return qualifier; + } + + public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { + final Country c = new Country(); + c.setClassid(q.getClassid()); + c.setClassname(q.getClassname()); + c.setSchemeid(q.getSchemeid()); + c.setSchemename(q.getSchemename()); + c.setDataInfo(mapDataInfo(q.getDataInfo())); + return c; + } + + public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(sp.getValue()); + structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); + structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); + return structuredProperty; + } + + public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { + final ExtraInfo entity = new ExtraInfo(); + entity.setName(extraInfo.getName()); + entity.setTypology(extraInfo.getTypology()); + entity.setProvenance(extraInfo.getProvenance()); + entity.setTrust(extraInfo.getTrust()); + entity.setValue(extraInfo.getValue()); + return entity; + } + + public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { + final OAIProvenance entity = new OAIProvenance(); + entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); + return entity; + } + + public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { + final OriginDescription originDescriptionResult = new OriginDescription(); + originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); + originDescriptionResult.setAltered(originDescription.getAltered()); + 
originDescriptionResult.setBaseURL(originDescription.getBaseURL()); + originDescriptionResult.setIdentifier(originDescription.getIdentifier()); + originDescriptionResult.setDatestamp(originDescription.getDatestamp()); + originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); + return originDescriptionResult; + } + + public static Field mapStringField(FieldTypeProtos.StringField s) { + final Field stringField = new Field<>(); + stringField.setValue(s.getValue()); + stringField.setDataInfo(mapDataInfo(s.getDataInfo())); + return stringField; + } + + public static Field mapBoolField(FieldTypeProtos.BoolField b) { + final Field booleanField = new Field<>(); + booleanField.setValue(b.getValue()); + booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); + return booleanField; + } + + public static Field mapIntField(FieldTypeProtos.IntField b) { + final Field entity = new Field<>(); + entity.setValue(b.getValue()); + entity.setDataInfo(mapDataInfo(b.getDataInfo())); + return entity; + } + + public static Journal mapJournal(FieldTypeProtos.Journal j) { + final Journal journal = new Journal(); + journal.setConferencedate(j.getConferencedate()); + journal.setConferenceplace(j.getConferenceplace()); + journal.setEdition(j.getEdition()); + journal.setEp(j.getEp()); + journal.setIss(j.getIss()); + journal.setIssnLinking(j.getIssnLinking()); + journal.setIssnOnline(j.getIssnOnline()); + journal.setIssnPrinted(j.getIssnPrinted()); + journal.setName(j.getName()); + journal.setSp(j.getSp()); + journal.setVol(j.getVol()); + journal.setDataInfo(mapDataInfo(j.getDataInfo())); + return journal; + } + + public static Author mapAuthor(FieldTypeProtos.Author author) { + final Author entity = new Author(); + entity.setFullname(author.getFullname()); + entity.setName(author.getName()); + entity.setSurname(author.getSurname()); + entity.setRank(author.getRank()); + entity.setPid(author.getPidList() + .stream() + .map(kv -> { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(kv.getValue()); + final Qualifier q = new Qualifier(); + q.setClassid(kv.getKey()); + q.setClassname(kv.getKey()); + sp.setQualifier(q); + return sp; + }) + .collect(Collectors.toList())); + entity.setAffiliation(author.getAffiliationList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + return entity; + + } + + public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { + final GeoLocation entity = new GeoLocation(); + entity.setPoint(geoLocation.getPoint()); + entity.setBox(geoLocation.getBox()); + entity.setPlace(geoLocation.getPlace()); + return entity; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java new file mode 100644 index 0000000000..cf95711ebc --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java @@ -0,0 +1,159 @@ +package eu.dnetlib.dhp.migration.actions; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.protobuf.InvalidProtocolBufferException; +import eu.dnetlib.actionmanager.actions.AtomicAction; +import eu.dnetlib.data.proto.OafProtos; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import 
eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.IOException; +import java.io.Serializable; +import java.util.LinkedList; + +public class TransformActions implements Serializable { + + private static final Log log = LogFactory.getLog(TransformActions.class); + private static final String SEPARATOR = "/"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateActionSet.class.getResourceAsStream( + "/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json"))); + parser.parseArgument(args); + + new TransformActions().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException { + + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: " + isLookupUrl); + + final String inputPaths = parser.get("inputPaths"); + + if (StringUtils.isBlank(inputPaths)) { + throw new RuntimeException("empty inputPaths"); + } + log.info("inputPaths: " + inputPaths); + + final String targetBaseDir = getTargetBaseDir(isLookupUrl); + + try(SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + + for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { + + LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); + + final String rawset = pathQ.pollLast(); + final String actionSetDirectory = pathQ.pollLast(); + + final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); + + if (fs.exists(targetDirectory)) { + log.info(String.format("found target directory '%s", targetDirectory)); + fs.delete(targetDirectory, true); + log.info(String.format("deleted target directory '%s", targetDirectory)); + } + + log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory)); + + sc.sequenceFile(sourcePath, Text.class, Text.class) + .mapToPair(a -> new Tuple2<>(a._1(), AtomicAction.fromJSON(a._2().toString()))) + .mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2()))) + + .saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + } + } + } + + private Text transformAction(String atomicaActionId, AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException { + + final ObjectMapper mapper = new ObjectMapper(); + if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) { + Oaf oaf 
= ProtoConverter.convert(OafProtos.Oaf.parseFrom(aa.getTargetValue())); + aa.setTargetValue(mapper.writeValueAsString(oaf).getBytes()); + } else { + + if (atomicaActionId.contains("dedupSimilarity")) { + + final String[] splitId = atomicaActionId.split("@"); + + String source = splitId[0]; + String target = splitId[2]; + + String[] relSemantic = splitId[1].split("_"); + + Relation rel = new Relation(); + rel.setSource(source); + rel.setTarget(target); + rel.setRelType(relSemantic[0]); + rel.setSubRelType(relSemantic[1]); + rel.setRelClass(relSemantic[2]); + + DataInfo d = new DataInfo(); + d.setDeletedbyinference(false); + d.setInferenceprovenance("deduplication"); + d.setInferred(true); + d.setInvisible(false); + Qualifier provenanceaction = new Qualifier(); + + provenanceaction.setClassid("deduplication"); + provenanceaction.setClassname("deduplication"); + provenanceaction.setSchemeid("dnet:provenanceActions"); + provenanceaction.setSchemename("dnet:provenanceActions"); + + d.setProvenanceaction(provenanceaction); + + rel.setDataInfo(d); + + aa.setTargetValue(mapper.writeValueAsString(rel).getBytes()); + } + } + + return new Text(mapper.writeValueAsString(aa)); + } + + private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; + return isLookUp.getResourceProfileByQuery(XQUERY); + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(TransformActions.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java similarity index 81% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java index d22e8e5b36..5e54c2b86d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java @@ -1,4 +1,14 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.step1; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty; import java.io.Closeable; import java.io.IOException; @@ -17,18 +27,26 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication; +import eu.dnetlib.dhp.migration.utils.DbClient; +import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable { +public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); @@ -50,32 +68,36 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl final String dbPassword = parser.get("postgresPassword"); final String hdfsPath = parser.get("hdfsPath"); - final String hdfsNameNode = parser.get("namenode"); - final String hdfsUser = parser.get("hdfsUser"); - try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) { - log.info("Processing datasources..."); - smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); - log.info("Processing projects..."); - smdbe.execute("queryProjects.sql", smdbe::processProject); + try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { + if (processClaims) { + log.info("Processing claims..."); + smdbe.execute("queryClaims.sql", smdbe::processClaims); + } else { + log.info("Processing datasources..."); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); - log.info("Processing orgs..."); - smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + log.info("Processing projects..."); + smdbe.execute("queryProjects.sql", smdbe::processProject); - log.info("Processing relations ds <-> orgs ..."); - smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + log.info("Processing orgs..."); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); - log.info("Processing projects <-> orgs ..."); - smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + log.info("Processing relations ds <-> orgs ..."); + smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + log.info("Processing projects <-> orgs ..."); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + } log.info("All done."); } } - public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser, + public MigrateDbEntitiesApplication(final String hdfsPath, final String 
dbUrl, final String dbUser, final String dbPassword) throws Exception { - super(hdfsPath, hdfsNameNode, hdfsUser); + super(hdfsPath); this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); this.lastUpdateTimestamp = new Date().getTime(); } @@ -93,7 +115,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl final Datasource ds = new Datasource(); - ds.setId(createOpenaireId(10, rs.getString("datasourceid"))); + ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); ds.setPid(new ArrayList<>()); @@ -200,7 +222,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl final Project p = new Project(); - p.setId(createOpenaireId(40, rs.getString("projectid"))); + p.setId(createOpenaireId(40, rs.getString("projectid"), true)); p.setOriginalId(Arrays.asList(rs.getString("projectid"))); p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); p.setPid(new ArrayList<>()); @@ -290,7 +312,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl final Organization o = new Organization(); - o.setId(createOpenaireId(20, rs.getString("organizationid"))); + o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); o.setPid(new ArrayList<>()); @@ -354,8 +376,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl try { final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("organization")); - final String dsId = createOpenaireId(10, rs.getString("datasource")); + final String orgId = createOpenaireId(20, rs.getString("organization"), true); + final String dsId = createOpenaireId(10, rs.getString("datasource"), true); final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); @@ -377,7 +399,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl r2.setTarget(dsId); r2.setCollectedFrom(collectedFrom); r2.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); + r2.setLastupdatetimestamp(lastUpdateTimestamp); emitOaf(r2); // rs.getString("datasource"); @@ -403,8 +425,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl try { final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("resporganization")); - final String projectId = createOpenaireId(40, rs.getString("project")); + final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); + final String projectId = createOpenaireId(40, rs.getString("project"), true); final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); @@ -426,7 +448,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl r2.setTarget(projectId); r2.setCollectedFrom(collectedFrom); r2.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); + r2.setLastupdatetimestamp(lastUpdateTimestamp); emitOaf(r2); // rs.getString("project"); @@ -450,6 +472,81 @@ public class 
MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl } } + public void processClaims(final ResultSet rs) { + + final DataInfo info = + dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9"); + + try { + + if (rs.getString("source_type").equals("context")) { + final Result r; + + if (rs.getString("target_type").equals("dataset")) { + r = new Dataset(); + } else if (rs.getString("target_type").equals("software")) { + r = new Software(); + } else if (rs.getString("target_type").equals("other")) { + r = new OtherResearchProduct(); + } else { + r = new Publication(); + } + r.setId(createOpenaireId(50, rs.getString("target_id"), false)); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setContext(prepareContext(rs.getString("source_id"), info)); + r.setDataInfo(info); + emitOaf(r); + } else { + final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); + final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); + + final Relation r1 = new Relation(); + final Relation r2 = new Relation(); + + if (rs.getString("source_type").equals("project")) { + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("produces"); + + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("isProducedBy"); + } else { + r1.setRelType("resultResult"); + r1.setSubRelType("relationship"); + r1.setRelClass("isRelatedTo"); + + r2.setRelType("resultResult"); + r2.setSubRelType("relationship"); + r2.setRelClass("isRelatedTo"); + } + + r1.setSource(sourceId); + r1.setTarget(targetId); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + emitOaf(r1); + + r2.setSource(targetId); + r2.setTarget(sourceId); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + emitOaf(r2); + + } + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + private List prepareContext(final String id, final DataInfo dataInfo) { + final Context context = new Context(); + context.setId(id); + context.setDataInfo(Arrays.asList(dataInfo)); + return Arrays.asList(context); + } + private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); final String inferenceprovenance = rs.getString("inferenceprovenance"); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java new file mode 100644 index 0000000000..b1de31326b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java @@ -0,0 +1,67 @@ +package eu.dnetlib.dhp.migration.step1; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication; +import eu.dnetlib.dhp.migration.utils.MdstoreClient; + +public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable { + + private static final Log log = 
LogFactory.getLog(MigrateMongoMdstoresApplication.class); + + private final MdstoreClient mdstoreClient; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json"))); + parser.parseArgument(args); + + final String mongoBaseUrl = parser.get("mongoBaseUrl"); + final String mongoDb = parser.get("mongoDb"); + + final String mdFormat = parser.get("mdFormat"); + final String mdLayout = parser.get("mdLayout"); + final String mdInterpretation = parser.get("mdInterpretation"); + + final String hdfsPath = parser.get("hdfsPath"); + + try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) { + app.execute(mdFormat, mdLayout, mdInterpretation); + } + + } + + public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception { + super(hdfsPath); + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + } + + public void execute(final String format, final String layout, final String interpretation) { + final Map colls = mdstoreClient.validCollections(format, layout, interpretation); + log.info("Found " + colls.size() + " mdstores"); + + for (final Entry entry : colls.entrySet()) { + log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); + final String currentColl = entry.getValue(); + + for (final String xml : mdstoreClient.listRecords(currentColl)) { + emit(xml, "native_" + format); + } + } + } + + @Override + public void close() throws IOException { + super.close(); + mdstoreClient.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java similarity index 80% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java index 00d1aa60d6..7c3000fbad 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java @@ -1,20 +1,24 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.step2; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty; -import java.io.IOException; -import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import 
org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.dom4j.Document; -import org.dom4j.DocumentException; import org.dom4j.DocumentFactory; import org.dom4j.DocumentHelper; import org.dom4j.Node; @@ -37,11 +41,9 @@ import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { +public abstract class AbstractMdRecordToOafMapper { - protected final Map code2name = new HashMap<>(); - - protected final MdstoreClient mdstoreClient; + protected final Map code2name; protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); @@ -51,79 +53,36 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); - private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class); - - public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, - final String mongoDb, final String dbUrl, final String dbUser, - final String dbPassword) throws Exception { - - super(hdfsPath, hdfsNameNode, hdfsUser); - - this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); - loadClassNames(dbUrl, dbUser, dbPassword); - - final Map nsContext = new HashMap<>(); - - registerNamespaces(nsContext); - - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + protected AbstractMdRecordToOafMapper(final Map code2name) { + this.code2name = code2name; } - private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + public List processMdRecord(final String xml) { + try { + final Map nsContext = new HashMap<>(); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - log.info("Loading vocabulary terms from db..."); + final Document doc = DocumentHelper.parseText(xml); - try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { - code2name.clear(); - dbClient.processResults("select code, name from class", rs -> { - try { - code2name.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); + final String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? 
collectedFrom + : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name")); + + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); + + return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); + } catch (final Exception e) { + throw new RuntimeException(e); } - - log.info("Found " + code2name.size() + " terms."); - - } - - public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException { - - log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation)); - - final Map colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation); - log.info("Found " + colls.size() + " mdstores"); - - for (final Entry entry : colls.entrySet()) { - log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); - final String currentColl = entry.getValue(); - - for (final String xml : mdstoreClient.listRecords(currentColl)) { - final Document doc = DocumentHelper.parseText(xml); - - final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); - final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom - : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name")); - - final DataInfo info = prepareDataInfo(doc); - final long lastUpdateTimestamp = new Date().getTime(); - - for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) { - emitOaf(oaf); - } - } - } - log.info("All Done."); - } - - protected void registerNamespaces(final Map nsContext) { - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); } protected List createOafs(final Document doc, @@ -194,10 +153,10 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final List res = new ArrayList<>(); - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); for (final Object o : doc.selectNodes("//oaf:projectid")) { - final String projectId = createOpenaireId(40, ((Node) o).getText()); + final String projectId = createOpenaireId(40, ((Node) o).getText(), true); final Relation r1 = new Relation(); r1.setRelType("resultProject"); @@ -238,7 +197,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final long lastUpdateTimestamp) { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"))); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); @@ -398,6 +357,8 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected DataInfo prepareDataInfo(final 
Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); + if (n == null) { return null; } + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); @@ -430,10 +391,4 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return res; } - @Override - public void close() throws IOException { - super.close(); - mdstoreClient.close(); - } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java new file mode 100644 index 0000000000..775e5e7d8e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java @@ -0,0 +1,172 @@ +package eu.dnetlib.dhp.migration.step2; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication; +import eu.dnetlib.dhp.migration.utils.DbClient; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import scala.Tuple2; + +public class GenerateEntitiesApplication { + + private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json"))); + + parser.parseArgument(args); + + final String sourcePaths = parser.get("sourcePaths"); + final String targetPath = parser.get("targetPath"); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + + try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) { + final List existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList()); + generateEntities(sc, code2name, existingSourcePaths, targetPath); + } + } + + private static SparkSession newSparkSession(final ArgumentApplicationParser parser) { + return 
SparkSession + .builder() + .appName(GenerateEntitiesApplication.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + } + + private static void generateEntities(final JavaSparkContext sc, + final Map code2name, + final List sourcePaths, + final String targetPath) { + + log.info("Generate entities from files:"); + sourcePaths.forEach(log::info); + + JavaRDD inputRdd = sc.emptyRDD(); + + for (final String sp : sourcePaths) { + inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .flatMap(list -> list.iterator()) + .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf))); + } + + inputRdd.saveAsTextFile(targetPath); + + } + + private static List convertToListOaf(final String id, final String s, final Map code2name) { + final String type = StringUtils.substringAfter(id, ":"); + + switch (type.toLowerCase()) { + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproducts": + default: + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + } + + } + + private static Map loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + + log.info("Loading vocabulary terms from db..."); + + final Map map = new HashMap<>(); + + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + dbClient.processResults("select code, name from class", rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } + + log.info("Found " + map.size() + " terms."); + + return map; + + } + + private static String convertToJson(final Oaf oaf) { + try { + return new ObjectMapper().writeValueAsString(oaf); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + private static Oaf convertFromJson(final String s, final Class clazz) { + try { + return new ObjectMapper().readValue(s, clazz); + } catch (final Exception e) { + log.error("Error parsing object of class: " + clazz); + log.error(s); + throw new RuntimeException(e); + } + } + + private static boolean exists(final JavaSparkContext context, final String pathToFile) { + try { + final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration()); + final Path path = new Path(pathToFile); + return hdfs.exists(path); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java similarity index 89% rename from 
dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java index c325682905..110abc4864 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java @@ -1,16 +1,17 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.step2; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.Node; -import eu.dnetlib.dhp.migration.pace.PacePerson; +import eu.dnetlib.dhp.migration.utils.PacePerson; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -22,20 +23,10 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class OafMigrationExecutor extends AbstractMongoExecutor { +public class OafToOafMapper extends AbstractMdRecordToOafMapper { - private static final Log log = LogFactory.getLog(OafMigrationExecutor.class); - - public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb, - final String dbUrl, final String dbUser, - final String dbPassword) throws Exception { - super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword); - } - - @Override - protected void registerNamespaces(final Map nsContext) { - super.registerNamespaces(nsContext); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + public OafToOafMapper(final Map code2name) { + super(code2name); } @Override @@ -211,12 +202,12 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); final List res = new ArrayList<>(); for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { - final String otherId = createOpenaireId(50, ((Node) o).getText()); + final String otherId = createOpenaireId(50, ((Node) o).getText(), false); final Relation r1 = new Relation(); r1.setRelType("resultResult"); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java similarity index 73% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java index 457534085a..b4868b8f9b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java @@ -1,4 +1,8 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.step2; + +import 
static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty; import java.util.ArrayList; import java.util.Arrays; @@ -6,8 +10,6 @@ import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.Node; @@ -22,38 +24,28 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class OdfMigrationExecutor extends AbstractMongoExecutor { +public class OdfToOafMapper extends AbstractMdRecordToOafMapper { - private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class); - - public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb, - final String dbUrl, final String dbUser, - final String dbPassword) throws Exception { - super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword); - } - - @Override - protected void registerNamespaces(final Map nsContext) { - super.registerNamespaces(nsContext); - nsContext.put("dc", "http://datacite.org/schema/kernel-3"); + public OdfToOafMapper(final Map code2name) { + super(code2name); } @Override protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); + return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); } @Override protected List prepareAuthors(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); int pos = 1; - for (final Object o : doc.selectNodes("//dc:creator")) { + for (final Object o : doc.selectNodes("//datacite:creator")) { final Node n = (Node) o; final Author author = new Author(); - author.setFullname(n.valueOf("./dc:creatorName")); - author.setName(n.valueOf("./dc:givenName")); - author.setSurname(n.valueOf("./dc:familyName")); - author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info)); + author.setFullname(n.valueOf("./datacite:creatorName")); + author.setName(n.valueOf("./datacite:givenName")); + author.setSurname(n.valueOf("./datacite:familyName")); + author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); author.setPid(preparePids(doc, info)); author.setRank(pos++); res.add(author); @@ -63,7 +55,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { private List preparePids(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./dc:nameIdentifier")) { + for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info)); } return res; @@ -72,7 +64,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) { + for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) 
{ final Instance instance = new Instance(); instance.setUrl(Arrays.asList(((Node) o).getText().trim())); instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); @@ -98,7 +90,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected List prepareRelevantDates(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//dc:date")) { + for (final Object o : doc.selectNodes("//datacite:date")) { final String dateType = ((Node) o).valueOf("@dateType"); if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { @@ -115,32 +107,32 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:contributorName", info); + return prepareListFields(doc, "//datacite:contributorName", info); } @Override protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:format", info); + return prepareListFields(doc, "//datacite:format", info); } @Override protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:publisher", info); + return prepareField(doc, "//datacite:publisher", info); } @Override protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info); + return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); } @Override protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:subject", info); + return prepareListStructProps(doc, "//datacite:subject", info); } @Override protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); + return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); } @Override @@ -150,17 +142,17 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info); + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info); + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages"); + return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); } @Override @@ -175,7 +167,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected List> prepareSoftwareDocumentationUrls(final Document 
doc, final DataInfo info) { - return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -184,11 +176,11 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//dc:geoLocation")) { + for (final Object o : doc.selectNodes("//datacite:geoLocation")) { final GeoLocation loc = new GeoLocation(); - loc.setBox(((Node) o).valueOf("./dc:geoLocationBox")); - loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace")); - loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint")); + loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); + loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); res.add(loc); } return res; @@ -201,17 +193,17 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:date[@dateType='Updated']", info); + return prepareField(doc, "//datacite:date[@dateType='Updated']", info); } @Override protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:version", info); + return prepareField(doc, "//datacite:version", info); } @Override protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:size", info); + return prepareField(doc, "//datacite:size", info); } @Override @@ -221,18 +213,18 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:date[@dateType='Issued']", info); + return prepareField(doc, "//datacite:date[@dateType='Issued']", info); } @Override protected List addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { - final String otherId = createOpenaireId(50, ((Node) o).getText()); + for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText(), false); final String type = ((Node) o).valueOf("@relationType"); if (type.equals("IsSupplementTo")) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java new file mode 100644 index 0000000000..4f10068e77 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java @@ -0,0 +1,70 @@ +package eu.dnetlib.dhp.migration.step3; + +import org.apache.commons.io.IOUtils; +import 
org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class DispatchEntitiesApplication { + + private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json"))); + parser.parseArgument(args); + + try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) { + + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); + + processEntity(sc, Publication.class, sourcePath, targetPath); + processEntity(sc, Dataset.class, sourcePath, targetPath); + processEntity(sc, Software.class, sourcePath, targetPath); + processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath); + processEntity(sc, Datasource.class, sourcePath, targetPath); + processEntity(sc, Organization.class, sourcePath, targetPath); + processEntity(sc, Project.class, sourcePath, targetPath); + processEntity(sc, Relation.class, sourcePath, targetPath); + } + } + + private static SparkSession newSparkSession(final ArgumentApplicationParser parser) { + return SparkSession + .builder() + .appName(DispatchEntitiesApplication.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + } + + private static void processEntity(final JavaSparkContext sc, final Class clazz, final String sourcePath, final String targetPath) { + final String type = clazz.getSimpleName().toLowerCase(); + + log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath)); + + sc.textFile(sourcePath) + .filter(l -> isEntityType(l, type)) + .map(l -> StringUtils.substringAfter(l, "|")) + .saveAsTextFile(targetPath + "/" + type); // use repartition(XXX) ??? 
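+ // Each line produced by GenerateEntitiesApplication has the form "<entity type>|<json>",
+ // e.g. "publication|{...}": isEntityType() matches the prefix before the '|' and
+ // substringAfter() strips it, so every directory under targetPath ends up holding plain
+ // JSON records of a single entity type. A repartition(N) before saveAsTextFile, as hinted
+ // above, would only tune the number of output files, not the dispatched content.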
+ } + + private static boolean isEntityType(final String line, final String type) { + return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java new file mode 100644 index 0000000000..8eb444562c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java @@ -0,0 +1,77 @@ +package eu.dnetlib.dhp.migration.utils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.codehaus.jackson.map.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class AbstractMigrationApplication implements Closeable { + + private final AtomicInteger counter = new AtomicInteger(0); + + private final Text key = new Text(); + + private final Text value = new Text(); + + private final SequenceFile.Writer writer; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); + + public AbstractMigrationApplication(final String hdfsPath) throws Exception { + + log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); + + this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer + .keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class)); + } + + private Configuration getConf() throws IOException { + final Configuration conf = new Configuration(); + /* + * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + * conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); + * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); + */ + return conf; + } + + protected void emit(final String s, final String type) { + try { + key.set(counter.getAndIncrement() + ":" + type); + value.set(s); + writer.append(key, value); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + protected void emitOaf(final Oaf oaf) { + try { + emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public ObjectMapper getObjectMapper() { + return objectMapper; + } + + @Override + public void close() throws IOException { + writer.hflush(); + writer.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java similarity index 78% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java index 9ac0089d25..8e97843464 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.utils; import java.io.Closeable; import java.io.IOException; @@ -28,8 +28,8 @@ public class DbClient implements Closeable { StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address); this.connection.setAutoCommit(false); } catch (final Exception e) { - log.error(e.getClass().getName() + ": " + e.getMessage()); - throw new RuntimeException(e); + log.error("Connection to postgresDB failed"); + throw new RuntimeException("Connection to postgresDB failed", e); } log.info("Opened database successfully"); } @@ -44,10 +44,12 @@ public class DbClient implements Closeable { consumer.accept(rs); } } catch (final SQLException e) { - throw new RuntimeException(e); + log.error("Error executing sql query: " + sql, e); + throw new RuntimeException("Error executing sql query", e); } } catch (final SQLException e1) { - throw new RuntimeException(e1); + log.error("Error preparing sql statement", e1); + throw new RuntimeException("Error preparing sql statement", e1); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java index 87dadfc7aa..612503da74 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.utils; import java.io.Closeable; import java.io.IOException; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java similarity index 67% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java index e91a530458..8e51c1858e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java @@ -1,24 +1,12 @@ -package eu.dnetlib.dhp.migration; +package eu.dnetlib.dhp.migration.utils; -import java.io.Closeable; -import java.io.IOException; -import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; -import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.codehaus.jackson.map.ObjectMapper; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.ExtraInfo; @@ -26,60 +14,12 
@@ import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.OAIProvenance; -import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OriginDescription; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; -public class AbstractMigrationExecutor implements Closeable { - - private final AtomicInteger counter = new AtomicInteger(0); - - private final Text key = new Text(); - - private final Text value = new Text(); - - private final ObjectMapper objectMapper = new ObjectMapper(); - - private final SequenceFile.Writer writer; - - private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class); - - public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { - - log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser)); - - this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer - .keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class)); - } - - private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException { - final Configuration conf = new Configuration(); - conf.set("fs.defaultFS", hdfsNameNode); - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - System.setProperty("HADOOP_USER_NAME", hdfsUser); - System.setProperty("hadoop.home.dir", "/"); - FileSystem.get(URI.create(hdfsNameNode), conf); - return conf; - } - - protected void emitOaf(final Oaf oaf) { - try { - key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase()); - value.set(objectMapper.writeValueAsString(oaf)); - writer.append(key, value); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - @Override - public void close() throws IOException { - writer.hflush(); - writer.close(); - } +public class OafMapperUtils { public static KeyValue keyValue(final String k, final String v) { final KeyValue kv = new KeyValue(); @@ -223,14 +163,33 @@ public class AbstractMigrationExecutor implements Closeable { return d; } - public static String createOpenaireId(final int prefix, final String originalId) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) { + if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } + public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, 
originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } } public static String asString(final Object o) { return o == null ? "" : o.toString(); } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java index 927f5641bf..69e128e63c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.migration.pace; +package eu.dnetlib.dhp.migration.utils; import java.nio.charset.Charset; import java.text.Normalizer; diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json similarity index 63% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json index f179ee0f80..8c81290ca2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json @@ -2,7 +2,7 @@ { "paramName": "s", "paramLongName": "sourcePath", - "paramDescription": "the HDFS source path which contains the sequential file", + "paramDescription": "the source path", "paramRequired": true }, { @@ -16,11 +16,5 @@ "paramLongName": "graphRawPath", "paramDescription": "the path of the graph Raw in hdfs", "paramRequired": true - }, - { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "The entity to extract", - "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json new file mode 100644 index 0000000000..53ee010c45 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json @@ -0,0 +1,39 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePaths", + "paramDescription": "the HDFS source paths which contains the sequential file (comma separated)", + "paramRequired": true + }, + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the path of the target file", + "paramRequired": true + }, + { + "paramName": "pgurl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "pguser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": false + }, + { + "paramName": "pgpasswd", + "paramLongName": "postgresPassword", + 
"paramDescription": "postgres password", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json new file mode 100644 index 0000000000..c4910ec61b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json @@ -0,0 +1,10 @@ +[ + {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, + {"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true}, + {"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true}, + {"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true}, + {"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true}, + {"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true}, + {"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true} +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json index 5e9f378f55..cb13ff0242 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json @@ -6,33 +6,27 @@ "paramRequired": true }, { - "paramName": "n", - "paramLongName": "namenode", - "paramDescription": "the Name Node URI", - "paramRequired": true - }, - { - "paramName": "u", - "paramLongName": "hdfsUser", - "paramDescription": "the user wich create the hdfs seq file", - "paramRequired": true - }, - { - "paramName": "dburl", + "paramName": "pgurl", "paramLongName": "postgresUrl", "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", "paramRequired": true }, { - "paramName": "dbuser", + "paramName": "pguser", "paramLongName": "postgresUser", "paramDescription": "postgres user", "paramRequired": false }, { - "paramName": "dbpasswd", + "paramName": "pgpasswd", "paramLongName": "postgresPassword", "paramDescription": "postgres password", "paramRequired": false + }, + { + "paramName": "a", + "paramLongName": "action", + "paramDescription": "process claims", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json index 5738daa762..ee1a6ac4ee 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json @@ -5,18 +5,6 @@ "paramDescription": "the path where storing the sequential file", "paramRequired": true }, - { - "paramName": "n", - "paramLongName": "namenode", - "paramDescription": "the Name Node URI", - "paramRequired": true - }, - { - "paramName": "u", - "paramLongName": "hdfsUser", - "paramDescription": "the user wich create the hdfs seq file", - "paramRequired": true - }, { "paramName": "mongourl", "paramLongName": "mongoBaseUrl", @@ -24,7 +12,7 @@ "paramRequired": true }, { - "paramName": "db", + "paramName": "mongodb", "paramLongName": "mongoDb", "paramDescription": "mongo database", "paramRequired": true @@ -46,23 +34,5 @@ "paramLongName": "mdInterpretation", "paramDescription": "metadata interpretation", "paramRequired": true - }, - { - "paramName": "pgurl", - "paramLongName": "postgresUrl", - "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", - "paramRequired": true - }, - { - "paramName": "pguser", - "paramLongName": "postgresUser", - "paramDescription": "postgres user", - "paramRequired": false - }, - { - "paramName": "pgpasswd", - "paramLongName": "postgresPassword", - "paramDescription": "postgres password", - "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml deleted file mode 100644 index ff23fff4a3..0000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml +++ /dev/null @@ -1,282 +0,0 @@ - - - - workingPath - the base path to store hdfs file - - - graphRawPath - the graph Raw base path - - - - postgresURL - the postgres URL to access to the database - - - postgresUser - the user postgres - - - postgresPassword - the password postgres - - - mongourl - mongoDB url, example: mongodb://[username:password@]host[:port] - - - mongoDb - mongo database - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication - -p${workingPath}/db_entities - -n${nameNode} - -u${hdfsUser} - -dburl${postgresURL} - -dbuser${postgresUser} - -dbpasswd${postgresPassword} - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication - -p${workingPath}/odf_entities - -n${nameNode} - -u${hdfsUser} - -mongourl${mongourl} - -db${mongoDb} - -fODF - -lstore - -icleaned - -pgurl${postgresURL} - -pguser${postgresUser} - -pgpasswd${postgresPassword} - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication - -p${workingPath}/oaf_entities - -n${nameNode} - -u${hdfsUser} - -mongourl${mongourl} - -db${mongoDb} - -fOAF - -lstore - -icleaned - -pgurl${postgresURL} - -pguser${postgresUser} - -pgpasswd${postgresPassword} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: publication - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - 
dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/publication - -epublication - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: dataset - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/dataset - -edataset - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: software - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/software - -esoftware - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: otherresearchproduct - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/otherresearchproduct - -eotherresearchproduct - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: datasource - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/datasource - -edatasource - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: organization - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf 
spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/organization - -eorganization - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: project - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/project - -eproject - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: relation - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/relation - -erelation - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql new file mode 100644 index 0000000000..0390c11aa3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql @@ -0,0 +1 @@ +SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql new file mode 100644 index 0000000000..6cff188756 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql @@ -0,0 +1,90 @@ +SELECT + p.id AS projectid, + p.code AS code, + p.websiteurl AS websiteurl, + p.acronym AS acronym, + p.title AS title, + p.startdate AS startdate, + p.enddate AS enddate, + p.call_identifier AS callidentifier, + p.keywords AS keywords, + p.duration AS duration, + p.ec_sc39 AS ecsc39, + p.oa_mandate_for_publications AS oamandatepublications, + p.ec_article29_3 AS ecarticle29_3, + p.dateofcollection AS dateofcollection, + p.lastupdate AS dateoftransformation, + p.inferred AS inferred, + p.deletedbyinference AS deletedbyinference, + p.trust AS trust, + p.inferenceprovenance AS inferenceprovenance, + p.optional1 AS optional1, + p.optional2 AS optional2, + p.jsonextrainfo AS jsonextrainfo, + p.contactfullname AS contactfullname, + p.contactfax AS contactfax, + p.contactphone AS contactphone, + p.contactemail AS contactemail, + p.summary AS summary, + p.currency AS currency, + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype, + pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS 
provenanceaction, + array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid, + array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects, + array_agg(DISTINCT fp.path) AS fundingtree + FROM projects p + LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass) + LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme) + + LEFT OUTER JOIN projectpids pp ON (pp.project = p.id) + LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid) + + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) + + LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id) + LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding) + + LEFT OUTER JOIN project_subject ps ON (ps.project = p.id) + LEFT OUTER JOIN subjects s ON (s.id = ps.subject) + + LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass) + LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme) + + LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass) + LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme) + + GROUP BY + p.id, + p.code, + p.websiteurl, + p.acronym, + p.title, + p.startdate, + p.enddate, + p.call_identifier, + p.keywords, + p.duration, + p.ec_sc39, + p.oa_mandate_for_publications, + p.ec_article29_3, + p.dateofcollection, + p.inferred, + p.deletedbyinference, + p.trust, + p.inferenceprovenance, + p.contactfullname, + p.contactfax, + p.contactphone, + p.contactemail, + p.summary, + p.currency, + p.totalcost, + p.fundedamount, + dc.id, + dc.officialname, + pac.code, pac.name, pas.code, pas.name, + ctc.code, ctc.name, cts.code, cts.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json new file mode 100644 index 0000000000..ce72f53ca6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, + {"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "URL of the isLookUp Service", "paramRequired": true} +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml new file mode 100644 index 0000000000..9637ebdc62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + sourceNN + webhdfs://namenode2.hadoop.dm.openaire.eu:50071 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088 + + + spark2EventLogDir + /user/spark/applicationHistory + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml new file mode 100644 
index 0000000000..ec2861a0ee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml @@ -0,0 +1,111 @@ + + + + sourceNN + the source name node + + + isLookupUrl + the isLookup service endpoint + + + workingDirectory + /tmp/actionsets + working directory + + + distcp_memory_mb + 6144 + memory for distcp copying actionsets from remote cluster + + + distcp_task_timeout + 60000000 + timeout for distcp copying actions from remote cluster + + + distcp_num_maps + 1 + mmaximum number of map tasks used in the distcp process + + + transform_only + activate tranform-only mode. Only apply transformation step + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.actions.MigrateActionSet + -Dmapred.task.timeout=${distcp_task_timeout} + -is${isLookupUrl} + -sn${sourceNN} + -tn${nameNode} + -w${workingDirectory} + -nm${distcp_num_maps} + -mm${distcp_memory_mb} + -tt${distcp_task_timeout} + -tr${transform_only} + + + + + + + + + ${jobTracker} + ${nameNode} + yarn + cluster + transform_actions + eu.dnetlib.dhp.migration.actions.TransformActions + dhp-aggregation-${projectVersion}.jar + + --executor-cores ${sparkExecutorCores} + --executor-memory ${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -mtyarn + -is${isLookupUrl} + --inputPaths${wf:actionData('migrate_actionsets')['target_paths']} + + + + + + + migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml similarity index 80% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml index 51e48d8f75..2e0ed9aeea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml @@ -15,8 +15,4 @@ oozie.action.sharelib.for.spark spark2 - - hdfsUser - dnet - \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml new file mode 100644 index 0000000000..1ac456976d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml @@ -0,0 +1,169 @@ + + + + migrationClaimsPathStep1 + the base path to store hdfs file + + + migrationClaimsPathStep2 + the 
temporary path to store entities before dispatching + + + migrationClaimsPathStep3 + the graph Raw base path + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + mongoURL + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${migrationClaimsPathStep1}/db_claims + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + -aclaims + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationClaimsPathStep1}/odf_claims + -mongourl${mongoURL} + -mongodb${mongoDb} + -fODF + -lstore + -iclaim + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationClaimsPathStep1}/oaf_claims + -mongourl${mongoURL} + -mongodb${mongoDb} + -fOAF + -lstore + -iclaim + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateClaimEntities + eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims + -t${migrationClaimsPathStep2}/claim_entities + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateClaimGraph + eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationClaimsPathStep2}/claim_entities + -g${migrationClaimsPathStep3} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml new file mode 100644 index 0000000000..39807dd365 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml @@ -0,0 +1,168 @@ + + + + migrationPathStep1 + the base path to store hdfs file + + + migrationPathStep2 + the temporary path to store entities before dispatching + + + migrationPathStep3 + the graph Raw base path + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + mongoURL + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${migrationPathStep1}/db_records + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationPathStep1}/odf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fODF + -lstore + -icleaned + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationPathStep1}/oaf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fOAF + -lstore + -icleaned + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateEntities + eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records + -t${migrationPathStep2}/all_entities + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateGraph + eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationPathStep2}/all_entities + -g${migrationPathStep3} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml new file mode 100644 index 0000000000..f16e22f957 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml @@ -0,0 +1,103 @@ + + + + migrationPathStep1 + the base path to store hdfs file + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + mongoURL + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${migrationPathStep1}/db_records + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationPathStep1}/odf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fODF + -lstore + -icleaned + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationPathStep1}/oaf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fOAF + -lstore + -icleaned + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml new file mode 100644 index 0000000000..cd0a4025e8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml @@ -0,0 +1,74 @@ + + + + migrationPathStep1 + the base path to store hdfs file + + + migrationPathStep2 + the temporary path to store entities before dispatching + + + postgresURL + the postgres URL to access to the 
database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateEntities + eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records + -t${migrationPathStep2}/all_entities + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml new file mode 100644 index 0000000000..8688f09d18 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml @@ -0,0 +1,60 @@ + + + + + migrationPathStep2 + the temporary path to store entities before dispatching + + + migrationPathStep3 + the graph Raw base path + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateGraph + eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationPathStep2}/all_entities + -g${migrationPathStep3} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index 
848fbe17da..b367491e52 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -1,79 +1,87 @@ package eu.dnetlib.dhp.collection; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.junit.*; - import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; + public class CollectionJobTest { - private Path testDir; - @Before - public void setup() throws IOException { - testDir = Files.createTempDirectory("dhp-collection"); - } + private Path testDir; - @After - public void teadDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } + @Before + public void setup() throws IOException { + testDir = Files.createTempDirectory("dhp-collection"); + } - @Test - public void tesCollection() throws Exception { - Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - GenerateNativeStoreSparkJob.main(new String[] { - "-mt", "local", - "-w", "wid", - "-e", "XML", - "-d", ""+System.currentTimeMillis(), - "-p", new ObjectMapper().writeValueAsString(provenance), - "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", testDir.toString()+"/store", - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", ""}); - System.out.println(new ObjectMapper().writeValueAsString(provenance)); - } + @After + public void teadDown() throws IOException { + FileUtils.deleteDirectory(testDir.toFile()); + } + @Test + public void tesCollection() throws Exception { + final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); + GenerateNativeStoreSparkJob.main(new String[] { + "-mt", "local", + "-w", "wid", + "-e", "XML", + "-d", "" + System.currentTimeMillis(), + "-p", new ObjectMapper().writeValueAsString(provenance), + "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), + "-o", testDir.toString() + "/store", + "-t", "true", + "-ru", "", + "-rp", "", + "-rh", "", + "-ro", "", + "-rr", "" }); + System.out.println(new ObjectMapper().writeValueAsString(provenance)); + } + @Test + public void testGenerationMetadataRecord() throws Exception { - @Test - public void testGenerationMetadataRecord() throws Exception { + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", + "ns_prefix"), System.currentTimeMillis(), null, null); - MetadataRecord record = 
GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); + assert record != null; + System.out.println(record.getId()); + System.out.println(record.getOriginalId()); - assert record != null; - System.out.println(record.getId()); - System.out.println(record.getOriginalId()); + } + @Test + public void TestEquals() throws IOException { - } + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", + "ns_prefix"), System.currentTimeMillis(), null, null); + final MetadataRecord record1 = GenerateNativeStoreSparkJob + .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", + "ns_prefix"), System.currentTimeMillis(), null, null); + assert record != null; + record.setBody("ciao"); + assert record1 != null; + record1.setBody("mondo"); + Assert.assertEquals(record, record1); - - @Test - public void TestEquals () throws IOException { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); - MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); - assert record != null; - record.setBody("ciao"); - assert record1 != null; - record1.setBody("mondo"); - Assert.assertEquals(record, record1); - - } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java index 7c0967b2e7..0291be47ef 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java @@ -18,13 +18,13 @@ public class GraphMappingUtils { public final static Map types = Maps.newHashMap(); static { - types.put("datasource", Datasource.class); - types.put("organization", Organization.class); + types.put("datasource", Datasource.class); + types.put("organization", Organization.class); types.put("project", Project.class); - types.put("dataset", Dataset.class); - types.put("otherresearchproduct", OtherResearchProduct.class); - types.put("software", Software.class); - types.put("publication", Publication.class); + types.put("dataset", Dataset.class); + types.put("otherresearchproduct", OtherResearchProduct.class); + types.put("software", Software.class); + types.put("publication", Publication.class); types.put("relation", Relation.class); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java index c5223c1f68..95c3cd4800 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.graph; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; @@ -13,31 +13,40 @@ public class SparkGraphImporterJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession + + try(SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String hiveDbName = parser.get("hive_db_name"); + + spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName)); + spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); + + // Read the input file and convert it into RDD of serializable object + GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name) + .map(s -> new ObjectMapper().readValue(s, clazz)) + .rdd(), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + "." + name)); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + return SparkSession .builder() .appName(SparkGraphImporterJob.class.getSimpleName()) .master(parser.get("master")) - .config("hive.metastore.uris", parser.get("hive_metastore_uris")) + .config(conf) .enableHiveSupport() .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String hiveDbName = parser.get("hive_db_name"); - - spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); - - // Read the input file and convert it into RDD of serializable object - GraphMappingUtils.types.forEach((name, clazz) -> { - spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class) - .map(s -> new ObjectMapper().readValue(s._2().toString(), clazz)) - .rdd(), Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." 
+ name); - }); - } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/hive/postprocessing.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/hive/postprocessing.sql new file mode 100644 index 0000000000..26fcbacf5f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/hive/postprocessing.sql @@ -0,0 +1,8 @@ +CREATE view result as + select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.publication p + union all + select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.dataset d + union all + select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.software s + union all + select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.otherresearchproduct o; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index 24090a2458..7a883db4c4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -37,12 +37,30 @@ MapGraphIntoDataFrame eu.dnetlib.dhp.graph.SparkGraphImporterJob dhp-graph-mapper-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster --sourcePath${sourcePath} --hive_db_name${hive_db_name} --hive_metastore_uris${hive_metastore_uris} + + + + + + + ${jobTracker} + ${nameNode} + + hive_db_name=${hive_db_name} + diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index b5ab079825..68816c2240 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,10 +1,12 @@ -sparkDriverMemory=8G -sparkExecutorMemory=8G +sparkDriverMemory=10G +sparkExecutorMemory=15G #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 outputPath=/tmp/openaire_provision format=TMF 
batchSize=2000 +sparkExecutorCoresForJoining=128 sparkExecutorCoresForIndexing=64 -reuseRecords=true \ No newline at end of file +reuseRecords=false +otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index 062c8886b8..29492cb907 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -1,31 +1,32 @@ package eu.dnetlib.dhp.graph; -import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; import eu.dnetlib.dhp.graph.model.*; import eu.dnetlib.dhp.graph.utils.ContextMapper; import eu.dnetlib.dhp.graph.utils.GraphMappingUtils; +import eu.dnetlib.dhp.graph.utils.RelationPartitioner; import eu.dnetlib.dhp.graph.utils.XmlRecordFactory; import eu.dnetlib.dhp.schema.oaf.*; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; import scala.Tuple2; import java.io.IOException; import java.io.Serializable; -import java.util.HashSet; -import java.util.List; -import java.util.stream.Collectors; +import java.util.Map; import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity; @@ -45,10 +46,12 @@ import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity; * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S * and E_target = T. 
Objects in T are heavily pruned by all the unnecessary information * - * 4) perform the join as (((T join R) union S) groupby S.id) yield S -> [ ] + * 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ ] */ public class GraphJoiner implements Serializable { + private Map accumulators = Maps.newHashMap(); + public static final int MAX_RELS = 100; public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; @@ -61,24 +64,30 @@ public class GraphJoiner implements Serializable { private String outPath; - public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) { + private String otherDsTypeId; + + public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) { this.spark = spark; this.contextMapper = contextMapper; + this.otherDsTypeId = otherDsTypeId; this.inputPath = inputPath; this.outPath = outPath; + + final SparkContext sc = spark.sparkContext(); + prepareAccumulators(sc); } public GraphJoiner adjacencyLists() { - final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext()); + final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext()); // read each entity - JavaPairRDD datasource = readPathEntity(sc, getInputPath(), "datasource"); - JavaPairRDD organization = readPathEntity(sc, getInputPath(), "organization"); - JavaPairRDD project = readPathEntity(sc, getInputPath(), "project"); - JavaPairRDD dataset = readPathEntity(sc, getInputPath(), "dataset"); - JavaPairRDD otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct"); - JavaPairRDD software = readPathEntity(sc, getInputPath(), "software"); - JavaPairRDD publication = readPathEntity(sc, getInputPath(), "publication"); + JavaPairRDD datasource = readPathEntity(jsc, getInputPath(), "datasource"); + JavaPairRDD organization = readPathEntity(jsc, getInputPath(), "organization"); + JavaPairRDD project = readPathEntity(jsc, getInputPath(), "project"); + JavaPairRDD dataset = readPathEntity(jsc, getInputPath(), "dataset"); + JavaPairRDD otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct"); + JavaPairRDD software = readPathEntity(jsc, getInputPath(), "software"); + JavaPairRDD publication = readPathEntity(jsc, getInputPath(), "publication"); // create the union between all the entities final String entitiesPath = getOutPath() + "/entities"; @@ -93,31 +102,43 @@ public class GraphJoiner implements Serializable { .map(GraphMappingUtils::serialize) .saveAsTextFile(entitiesPath, GzipCodec.class); - JavaPairRDD entities = sc.textFile(entitiesPath) + JavaPairRDD entities = jsc.textFile(entitiesPath) .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); + final String relationPath = getOutPath() + "/relation"; // reads the relationships - final JavaPairRDD relation = readPathRelation(sc, getInputPath()) - .filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted + final JavaPairRDD rels = readPathRelation(jsc, getInputPath()) + .filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted .map(p -> new EntityRelEntity().setRelation(p)) - .mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p)) - .groupByKey() + .mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p)); + rels + .groupByKey(new RelationPartitioner(rels.getNumPartitions())) .map(p -> 
Iterables.limit(p._2(), MAX_RELS)) .flatMap(p -> p.iterator()) + .map(s -> new ObjectMapper().writeValueAsString(s)) + .saveAsTextFile(relationPath, GzipCodec.class); + + final JavaPairRDD relation = jsc.textFile(relationPath) + .map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class)) .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); - //final String bySource = getOutPath() + "/1_join_by_target"; - JavaPairRDD bySource = relation + final String bySourcePath = getOutPath() + "/join_by_source"; + relation .join(entities .filter(e -> !e._2().getSource().getDeleted()) .mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2())))) .map(s -> new EntityRelEntity() .setRelation(s._2()._1().getRelation()) .setTarget(s._2()._2().getSource())) + .map(j -> new ObjectMapper().writeValueAsString(j)) + .saveAsTextFile(bySourcePath, GzipCodec.class); + + JavaPairRDD bySource = jsc.textFile(bySourcePath) + .map(e -> getObjectMapper().readValue(e, EntityRelEntity.class)) .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); - final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>()); + final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); entities .union(bySource) .groupByKey() // by source id @@ -130,20 +151,6 @@ public class GraphJoiner implements Serializable { return this; } - public GraphJoiner asXML() { - final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext()); - final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>()); - final ObjectMapper mapper = new ObjectMapper(); - - final String joinedEntitiesPath = getOutPath() + "/1_joined_entities"; - sc.textFile(joinedEntitiesPath) - .map(s -> mapper.readValue(s, JoinedEntity.class)) - .mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je)))) - .saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - - return this; - } - public SparkSession getSpark() { return spark; } @@ -158,24 +165,23 @@ public class GraphJoiner implements Serializable { // HELPERS - private OafEntity parseOaf(final String json, final String type) { - final ObjectMapper o = new ObjectMapper(); + private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) { try { switch (GraphMappingUtils.EntityType.valueOf(type)) { case publication: - return o.readValue(json, Publication.class); + return mapper.readValue(json, Publication.class); case dataset: - return o.readValue(json, Dataset.class); + return mapper.readValue(json, Dataset.class); case otherresearchproduct: - return o.readValue(json, OtherResearchProduct.class); + return mapper.readValue(json, OtherResearchProduct.class); case software: - return o.readValue(json, Software.class); + return mapper.readValue(json, Software.class); case datasource: - return o.readValue(json, Datasource.class); + return mapper.readValue(json, Datasource.class); case organization: - return o.readValue(json, Organization.class); + return mapper.readValue(json, Organization.class); case project: - return o.readValue(json, Project.class); + return mapper.readValue(json, Project.class); default: throw new IllegalArgumentException("invalid type: " + type); } @@ -185,26 +191,26 @@ public class GraphJoiner implements Serializable { } private JoinedEntity toJoinedEntity(Tuple2> p) { - final ObjectMapper o 
= new ObjectMapper(); + final ObjectMapper mapper = getObjectMapper(); final JoinedEntity j = new JoinedEntity(); - final Links links2 = new Links(); + final Links links = new Links(); for(EntityRelEntity rel : p._2()) { if (rel.hasMainEntity() & j.getEntity() == null) { j.setType(rel.getSource().getType()); - j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType())); + j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper)); } if (rel.hasRelatedEntity()) { try { - links2.add( + links.add( new eu.dnetlib.dhp.graph.model.Tuple2() - .setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class)) - .setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class))); + .setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class)) + .setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class))); } catch (IOException e) { throw new IllegalArgumentException(e); } } } - j.setLinks(links2); + j.setLinks(links); if (j.getEntity() == null) { throw new IllegalStateException("missing main entity on '" + p._1() + "'"); } @@ -250,8 +256,38 @@ public class GraphJoiner implements Serializable { .setTargetId(json.read("$.target")) .setDeleted(json.read("$.dataInfo.deletedbyinference")) .setType("relation") + .setRelType("$.relType") + .setSubRelType("$.subRelType") + .setRelClass("$.relClass") .setOaf(s); }); } + private ObjectMapper getObjectMapper() { + return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + } + + private void prepareAccumulators(SparkContext sc) { + accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + + accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + + accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators.put("organizationOrganization_dedup_isMergedIn", 
sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); + } + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java index 0b2180f190..5fa3e63850 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java @@ -24,6 +24,7 @@ public class SparkXmlRecordBuilderJob { final String inputPath = parser.get("sourcePath"); final String outputPath = parser.get("outputPath"); final String isLookupUrl = parser.get("isLookupUrl"); + final String otherDsTypeId = parser.get("otherDsTypeId"); final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); if (fs.exists(new Path(outputPath))) { @@ -31,8 +32,9 @@ public class SparkXmlRecordBuilderJob { fs.mkdirs(new Path(outputPath)); } - new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath) + new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) .adjacencyLists(); + //.asXML(); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/SortableRelationKey.java new file mode 100644 index 0000000000..6bfbab5471 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/SortableRelationKey.java @@ -0,0 +1,99 @@ +package eu.dnetlib.dhp.graph.model; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import java.io.Serializable; +import java.util.Map; + +/** + * Allows to sort relationships according to the priority defined in weights map. 
+ */ +public class SortableRelationKey implements Comparable, Serializable { + + private String sourceId; + private String targetId; + + private String relType; + private String subRelType; + private String relClass; + + private final static Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("publicationDataset", 2); + weights.put("relationship", 3); + weights.put("similarity", 4); + weights.put("affiliation", 5); + + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } + + public static SortableRelationKey from(final EntityRelEntity e) { + return new SortableRelationKey() + .setSourceId(e.getRelation().getSourceId()) + .setTargetId(e.getRelation().getTargetId()) + .setRelType(e.getRelation().getRelType()) + .setSubRelType(e.getRelation().getSubRelType()) + .setRelClass(e.getRelation().getRelClass()); + } + + public String getSourceId() { + return sourceId; + } + + public SortableRelationKey setSourceId(String sourceId) { + this.sourceId = sourceId; + return this; + } + + public String getTargetId() { + return targetId; + } + + public SortableRelationKey setTargetId(String targetId) { + this.targetId = targetId; + return this; + } + + public String getRelType() { + return relType; + } + + public SortableRelationKey setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubRelType() { + return subRelType; + } + + public SortableRelationKey setSubRelType(String subRelType) { + this.subRelType = subRelType; + return this; + } + + public String getRelClass() { + return relClass; + } + + public SortableRelationKey setRelClass(String relClass) { + this.relClass = relClass; + return this; + } + + @Override + public int compareTo(SortableRelationKey o) { + return ComparisonChain.start() + .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) + .compare(getSourceId(), o.getSourceId()) + .compare(getTargetId(), o.getTargetId()) + .result(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java index 3651e28c9d..8205c38ef7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java @@ -12,6 +12,10 @@ public class TypedRow implements Serializable { private String type; + private String relType; + private String subRelType; + private String relClass; + private String oaf; public String getSourceId() { @@ -50,6 +54,33 @@ public class TypedRow implements Serializable { return this; } + public String getRelType() { + return relType; + } + + public TypedRow setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubRelType() { + return subRelType; + } + + public TypedRow setSubRelType(String subRelType) { + this.subRelType = subRelType; + return this; + } + + public String getRelClass() { + return relClass; + } + + public TypedRow setRelClass(String relClass) { + this.relClass = relClass; + return this; + } + public String getOaf() { return oaf; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java index 0921fe1050..3d8cde703e 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java @@ -26,6 +26,8 @@ import static org.apache.commons.lang3.StringUtils.*; public class GraphMappingUtils { + public static final String SEPARATOR = "_"; + public enum EntityType { publication, dataset, otherresearchproduct, software, datasource, organization, project } @@ -38,34 +40,6 @@ public class GraphMappingUtils { public static Set instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation"); - private static BiMap relClassMapping = HashBiMap.create(); - - static { - relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution"); - relClassMapping.put("isMergedIn", "merges"); - relClassMapping.put("isProducedBy", "produces"); - relClassMapping.put("hasParticipant", "isParticipant"); - relClassMapping.put("isProvidedBy", "provides"); - relClassMapping.put("isRelatedTo", "isRelatedTo"); - relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments"); - relClassMapping.put("isRelatedTo", "isRelatedTo"); - relClassMapping.put("isSupplementTo", "isSupplementedBy"); - } - - public static String getInverseRelClass(final String relClass) { - String res = relClassMapping.get(relClass); - if (isNotBlank(res)) { - return res; - } - res = relClassMapping.inverse().get(relClass); - - if (isNotBlank(res)) { - return res; - } - - throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass); - } - private static final String schemeTemplate = "dnet:%s_%s_relations"; private static Map entityMapping = Maps.newHashMap(); @@ -158,7 +132,7 @@ public class GraphMappingUtils { re.setLegalname(j.read("$.legalname.value")); re.setLegalshortname(j.read("$.legalshortname.value")); re.setCountry(asQualifier(j.read("$.country"))); - + re.setWebsiteurl(j.read("$.websiteurl.value")); break; case project: re.setProjectTitle(j.read("$.title.value")); @@ -250,5 +224,8 @@ public class GraphMappingUtils { return s; } + public static String getRelDescriptor(String relType, String subRelType, String relClass) { + return relType + SEPARATOR + subRelType + SEPARATOR + relClass; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/RelationPartitioner.java new file mode 100644 index 0000000000..f4b1514d0e --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/RelationPartitioner.java @@ -0,0 +1,29 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.graph.model.SortableRelationKey; +import org.apache.spark.Partitioner; +import org.apache.spark.util.Utils; + +/** + * Used in combination with SortableRelationKey, allows to partition the records by source id, therefore + * allowing to sort relations sharing the same source id by the ordering defined in SortableRelationKey. 
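SortableRelationKey and the new RelationPartitioner (whose body follows) work as a pair: the partitioner routes every relation with the same source id to the same partition, while the key's compareTo orders them by the subRelType weights (outcome first, dedup last). The calling code is not part of this diff, so the wiring below is an assumption about how the two are typically combined with repartitionAndSortWithinPartitions:

    package eu.dnetlib.dhp.graph.utils;

    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import scala.Tuple2;

    import eu.dnetlib.dhp.graph.model.EntityRelEntity;
    import eu.dnetlib.dhp.graph.model.SortableRelationKey;

    // Hedged sketch: class and method names are illustrative, not taken from the patch.
    public class RelationSorting {

        public static JavaPairRDD<SortableRelationKey, EntityRelEntity> sortBySourceAndPriority(
                final JavaRDD<EntityRelEntity> rels, final int numPartitions) {
            return rels
                    .mapToPair(e -> new Tuple2<>(SortableRelationKey.from(e), e))
                    // same sourceId -> same partition; within a partition the Comparable key
                    // sorts records by the subRelType weights defined in SortableRelationKey
                    .repartitionAndSortWithinPartitions(new RelationPartitioner(numPartitions));
        }
    }

Because records arrive grouped by source and already ordered by priority, a downstream mapPartitions pass can cap the number of relations kept per source without resorting to a shuffle-heavy groupByKey.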
+ */ +public class RelationPartitioner extends Partitioner { + + private int numPartitions; + + public RelationPartitioner(int numPartitions) { + this.numPartitions = numPartitions; + } + + @Override + public int numPartitions() { + return numPartitions; + } + + @Override + public int getPartition(Object key) { + return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions()); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java index abcf2a7ecb..74e36a818c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.graph.utils; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLTag; @@ -11,6 +12,8 @@ import eu.dnetlib.dhp.graph.model.RelatedEntity; import eu.dnetlib.dhp.graph.model.Tuple2; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; @@ -27,6 +30,7 @@ import java.io.Serializable; import java.io.StringReader; import java.io.StringWriter; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -37,37 +41,49 @@ import static org.apache.commons.lang3.StringUtils.substringBefore; public class XmlRecordFactory implements Serializable { + private Map accumulators; + private Set specialDatasourceTypes; private ContextMapper contextMapper; private String schemaLocation; - private Set contextes = Sets.newHashSet(); - private boolean indent = false; public XmlRecordFactory( final ContextMapper contextMapper, final boolean indent, - final String schemaLocation, final Set otherDatasourceTypesUForUI) { + final String schemaLocation, final String otherDatasourceTypesUForUI) { + this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); + } + + public XmlRecordFactory( + final Map accumulators, + final ContextMapper contextMapper, final boolean indent, + final String schemaLocation, final String otherDatasourceTypesUForUI) { + + this.accumulators = accumulators; this.contextMapper = contextMapper; this.schemaLocation = schemaLocation; - this.specialDatasourceTypes = otherDatasourceTypesUForUI; + this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); this.indent = indent; } public String build(final JoinedEntity je) { + + final Set contexts = Sets.newHashSet(); + final OafEntity entity = je.getEntity(); TemplateFactory templateFactory = new TemplateFactory(); try { - final List metadata = metadata(je.getType(), entity); + final List metadata = metadata(je.getType(), entity, contexts); // rels has to be processed before the contexts because they enrich the contextMap with the funding info. 
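Two changes stand out in the XmlRecordFactory hunk above: the contexts set becomes a local variable created per build() call and passed explicitly to metadata(), listRelations() and buildContexts(), which avoids context ids accumulated for one record leaking into the next when the same factory instance is reused; and the special datasource types now arrive as a single comma-separated otherDsTypeId string instead of a pre-built set. A small sketch of that string handling, with an illustrative sample value that is not taken from the patch:

    import java.util.Set;

    import com.google.common.base.Splitter;
    import com.google.common.collect.Sets;

    // Mirrors the constructor change: the workflow passes one comma-separated string,
    // the factory turns it into the specialDatasourceTypes set used for datasourcetypeui.
    public class OtherDsTypeIdParsing {

        public static Set<String> parse(final String otherDsTypeId) {
            return Sets.newHashSet(Splitter.on(",").trimResults().split(otherDsTypeId));
        }

        public static void main(final String[] args) {
            // illustrative value only
            System.out.println(parse("scholarcomminfra, infospace, pubsrepository::mock"));
        }
    }
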
- final List relations = listRelations(je, templateFactory); + final List relations = listRelations(je, templateFactory, contexts); - metadata.addAll(buildContexts(getMainType(je.getType()))); + metadata.addAll(buildContexts(getMainType(je.getType()), contexts)); metadata.add(parseDataInfo(entity.getDataInfo())); final String body = templateFactory.buildBody( @@ -97,10 +113,11 @@ public class XmlRecordFactory implements Serializable { } } - private List metadata(final String type, final OafEntity entity) { + private List metadata(final String type, final OafEntity entity, final Set contexts) { final List metadata = Lists.newArrayList(); + if (entity.getCollectedfrom() != null) { metadata.addAll(entity.getCollectedfrom() .stream() @@ -123,6 +140,17 @@ public class XmlRecordFactory implements Serializable { if (GraphMappingUtils.isResult(type)) { final Result r = (Result) entity; + if (r.getContext() != null) { + contexts.addAll(r.getContext() + .stream() + .map(c -> c.getId()) + .collect(Collectors.toList())); + /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ + if (contexts.contains("dh-ch::subcommunity::2")) { + contexts.add("clarin"); + } + } + if (r.getTitle() != null) { metadata.addAll(r.getTitle() .stream() @@ -235,16 +263,6 @@ public class XmlRecordFactory implements Serializable { } metadata.add(mapQualifier("bestaccessright", getBestAccessright(r))); - - if (r.getContext() != null) { - contextes.addAll(r.getContext() - .stream() - .map(c -> c.getId()) - .collect(Collectors.toList())); - if (contextes.contains("dh-ch::subcommunity::2")) { - contextes.add("clarin"); - } - } } switch (EntityType.valueOf(type)) { @@ -445,7 +463,7 @@ public class XmlRecordFactory implements Serializable { if (ds.getSubjects() != null) { metadata.addAll(ds.getSubjects() .stream() - .map(sp -> mapStructuredProperty("subject", sp)) + .map(sp -> mapStructuredProperty("subjects", sp)) .collect(Collectors.toList())); } @@ -580,7 +598,7 @@ public class XmlRecordFactory implements Serializable { if (p.getFundingtree() != null) { metadata.addAll(p.getFundingtree() .stream() - .map(ft -> asXmlElement("fundingtree", ft.getValue())) + .map(ft -> ft.getValue()) .collect(Collectors.toList())); } @@ -618,7 +636,7 @@ public class XmlRecordFactory implements Serializable { return bestAccessRight; } - private List listRelations(final JoinedEntity je, TemplateFactory templateFactory) { + private List listRelations(final JoinedEntity je, TemplateFactory templateFactory, final Set contexts) { final List rels = Lists.newArrayList(); for (final Tuple2 link : je.getLinks()) { @@ -699,7 +717,7 @@ public class XmlRecordFactory implements Serializable { if (re.getFundingtree() != null) { metadata.addAll(re.getFundingtree() .stream() - .peek(ft -> fillContextMap(ft)) + .peek(ft -> fillContextMap(ft, contexts)) .map(ft -> getRelFundingTree(ft)) .collect(Collectors.toList())); } @@ -709,13 +727,23 @@ public class XmlRecordFactory implements Serializable { } final DataInfo info = rel.getDataInfo(); + final String scheme = getScheme(re.getType(), targetType); + + if (StringUtils.isBlank(scheme)) { + throw new IllegalArgumentException(String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); + } + + final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(1); + } rels.add(templateFactory.getRel( targetType, rel.getTarget(), Sets.newHashSet(metadata), - 
getInverseRelClass(rel.getRelClass()), - getScheme(targetType, re.getType()), + rel.getRelClass(), + scheme, info)); } return rels; @@ -807,14 +835,14 @@ public class XmlRecordFactory implements Serializable { .collect(Collectors.toList()) : Lists.newArrayList(); } - private List buildContexts(final String type) { + private List buildContexts(final String type, final Set contexts) { final List res = Lists.newArrayList(); if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) { XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); - for (final String context : contextes) { + for (final String context : contexts) { String id = ""; for (final String token : Splitter.on("::").split(context)) { @@ -882,7 +910,7 @@ public class XmlRecordFactory implements Serializable { return buffer.toString(); } - private void fillContextMap(final String xmlTree) { + private void fillContextMap(final String xmlTree, final Set contexts) { Document fundingPath; try { @@ -896,7 +924,7 @@ public class XmlRecordFactory implements Serializable { if (funder != null) { final String funderShortName = funder.valueOf("./shortname"); - contextes.add(funderShortName); + contexts.add(funderShortName); contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); @@ -905,17 +933,17 @@ public class XmlRecordFactory implements Serializable { contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); final Node level1 = fundingPath.selectSingleNode("//funding_level_1"); if (level1 == null) { - contextes.add(level0Id); + contexts.add(level0Id); } else { final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name")); contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); if (level2 == null) { - contextes.add(level1Id); + contexts.add(level1Id); } else { final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name")); contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); - contextes.add(level2Id); + contexts.add(level2Id); } } } @@ -928,7 +956,7 @@ public class XmlRecordFactory implements Serializable { @SuppressWarnings("unchecked") - private String getRelFundingTree(final String xmlTree) { + protected static String getRelFundingTree(final String xmlTree) { String funding = ""; try { final Document ftree = new SAXReader().read(new StringReader(xmlTree)); @@ -949,11 +977,11 @@ public class XmlRecordFactory implements Serializable { return funding; } - private String getFunderElement(final Document ftree) { - final String funderId = ftree.valueOf("//fundingtree/funder/id/text()"); - final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()"); - final String funderName = ftree.valueOf("//fundingtree/funder/name/text()"); - final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()"); + private static String getFunderElement(final Document ftree) { + final String funderId = ftree.valueOf("//fundingtree/funder/id"); + final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname"); + final String funderName = ftree.valueOf("//fundingtree/funder/name"); + final String funderJurisdiction = 
ftree.valueOf("//fundingtree/funder/jurisdiction"); return ""; diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json index e63322028c..a5d20a55f4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json @@ -1,6 +1,7 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true} + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, + {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}, + {"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index 3503589441..3132ae9407 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -50,9 +50,10 @@ eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob dhp-graph-provision-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} + --executor-cores ${sparkExecutorCoresForJoining} + --executor-memory ${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -60,6 +61,7 @@ -mt yarn -is ${isLookupUrl} + -t ${otherDsTypeId} --sourcePath${sourcePath} --outputPath${outputPath} @@ -77,8 +79,9 @@ eu.dnetlib.dhp.graph.SparkXmlIndexingJob dhp-graph-provision-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} + --executor-cores ${sparkExecutorCoresForIndexing} + --executor-memory ${sparkExecutorMemoryForIndexing} + --driver-memory=${sparkDriverMemoryForIndexing} --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} --conf 
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/GraphJoinerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/GraphJoinerTest.java new file mode 100644 index 0000000000..147ac801cd --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/GraphJoinerTest.java @@ -0,0 +1,38 @@ +package eu.dnetlib.dhp.graph; + +import org.junit.Before; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class GraphJoinerTest { + + private ClassLoader cl = getClass().getClassLoader(); + private Path workingDir; + private Path inputDir; + private Path outputDir; + + @Before + public void before() throws IOException { + workingDir = Files.createTempDirectory("promote_action_set"); + inputDir = workingDir.resolve("input"); + outputDir = workingDir.resolve("output"); + } + + private static void copyFiles(Path source, Path target) throws IOException { + Files.list(source).forEach(f -> { + try { + if (Files.isDirectory(f)) { + Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); + copyFiles(f, subTarget); + } else { + Files.copy(f, target.resolve(f.getFileName())); + } + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); + } +} diff --git a/pom.xml b/pom.xml index faed1db356..0310a3f44e 100644 --- a/pom.xml +++ b/pom.xml @@ -76,7 +76,7 @@ junit junit - 4.12 + ${junit.version} test @@ -110,6 +110,12 @@ ${dhp.hadoop.version} provided + + org.apache.hadoop + hadoop-distcp + ${dhp.hadoop.version} + provided + org.apache.spark spark-core_2.11 @@ -262,6 +268,16 @@ provided + + eu.dnetlib + dnet-actionmanager-common + 6.0.5 + + + eu.dnetlib + dnet-openaire-data-protos + 3.9.8-proto250 + eu.dnetlib dnet-pace-core @@ -481,6 +497,7 @@ 2.9.6 3.5 2.11.12 + 4.12 3.4.2
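The new GraphJoinerTest creates a temporary working directory in @Before and copies fixture files into it recursively, but nothing removes the directory afterwards. The snippet below is not part of the patch, only a sketch of how the temporary tree could be torn down after each test run:

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Comparator;
    import java.util.stream.Stream;

    import org.junit.After;

    // Hypothetical cleanup companion for GraphJoinerTest.
    public class GraphJoinerTestCleanup {

        private Path workingDir;

        @After
        public void after() throws IOException {
            if (workingDir != null && Files.exists(workingDir)) {
                try (Stream<Path> walk = Files.walk(workingDir)) {
                    walk.sorted(Comparator.reverseOrder()) // delete children before their parents
                        .forEach(p -> p.toFile().delete());
                }
            }
        }
    }
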