forked from D-Net/dnet-hadoop
Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop
This commit is contained in:
commit
201d79021e
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
|
@ -102,7 +102,7 @@
|
|||
</goals>
|
||||
</pluginExecutionFilter>
|
||||
<action>
|
||||
<ignore></ignore>
|
||||
<ignore />
|
||||
</action>
|
||||
</pluginExecution>
|
||||
</pluginExecutions>
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-dedup-openaire</artifactId>
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,13 +0,0 @@
|
|||
----------------------------------------------------------------
|
||||
Thu Mar 26 19:43:00 CET 2020:
|
||||
Booting Derby version The Apache Software Foundation - Apache Derby - 10.12.1.1 - (1704137): instance a816c00e-0171-1827-9724-000012c70f40
|
||||
on database directory /private/var/folders/xn/nr5vdk8n1572rvrnx5890_d80000gn/T/junit3871072562876431144/junit_metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@4e6b5ed4
|
||||
Loaded from file:/Users/claudio/.m2/repository/org/apache/derby/derby/10.12.1.1/derby-10.12.1.1.jar
|
||||
java.vendor=Oracle Corporation
|
||||
java.runtime.version=1.8.0_181-b13
|
||||
user.dir=/Users/claudio/workspace/git/dnet-hadoop/dhp-workflows/dhp-graph-mapper
|
||||
os.name=Mac OS X
|
||||
os.arch=x86_64
|
||||
os.version=10.15.3
|
||||
derby.system.home=null
|
||||
Database Class Loader started - derby.database.classpath=''
|
|
@ -1,10 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
sparkExecutorCoresForJoining=1
|
||||
sparkDriverMemoryForJoining=10G
|
||||
sparkExecutorMemoryForJoining=15G
|
||||
sparkExecutorCoresForIndexing=64
|
||||
sparkDriverMemoryForIndexing=3G
|
||||
sparkExecutorMemoryForIndexing=2G
|
||||
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
|
||||
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
|
||||
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
|
||||
outputPath=/tmp/openaire_provision
|
||||
format=TMF
|
||||
batchSize=2000
|
||||
reuseRecords=false
|
||||
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -29,14 +29,9 @@ public class SparkXmlRecordBuilderJob {
|
|||
final String otherDsTypeId = parser.get("otherDsTypeId");
|
||||
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
if (fs.exists(new Path(outputPath))) {
|
||||
fs.delete(new Path(outputPath), true);
|
||||
fs.mkdirs(new Path(outputPath));
|
||||
}
|
||||
|
||||
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
|
||||
.adjacencyLists();
|
||||
//.asXML();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -7,17 +7,17 @@ import java.nio.charset.StandardCharsets;
|
|||
|
||||
public class TemplateResources {
|
||||
|
||||
private String record = read("eu/dnetlib/dhp/graph/template/record.st");
|
||||
private String record = read("eu/dnetlib/dhp/oa/provision/template/record.st");
|
||||
|
||||
private String instance = read("eu/dnetlib/dhp/graph/template/instance.st");
|
||||
private String instance = read("eu/dnetlib/dhp/oa/provision/template/instance.st");
|
||||
|
||||
private String rel = read("eu/dnetlib/dhp/graph/template/rel.st");
|
||||
private String rel = read("eu/dnetlib/dhp/oa/provision/template/rel.st");
|
||||
|
||||
private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st");
|
||||
private String webresource = read("eu/dnetlib/dhp/oa/provision/template/webresource.st");
|
||||
|
||||
private String child = read("eu/dnetlib/dhp/graph/template/child.st");
|
||||
private String child = read("eu/dnetlib/dhp/oa/provision/template/child.st");
|
||||
|
||||
private String entity = read("eu/dnetlib/dhp/graph/template/entity.st");
|
||||
private String entity = read("eu/dnetlib/dhp/oa/provision/template/entity.st");
|
||||
|
||||
private static String read(final String classpathResource) throws IOException {
|
||||
return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8);
|
||||
|
|
|
@ -58,6 +58,10 @@
|
|||
|
||||
<action name="adjancency_lists">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
</prepare>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>build_adjacency_lists</name>
|
||||
|
@ -67,7 +71,6 @@
|
|||
--executor-cores ${sparkExecutorCoresForJoining}
|
||||
--executor-memory ${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
6
pom.xml
6
pom.xml
|
@ -1,11 +1,9 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<version>1.1.7-SNAPSHOT</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<url>http://www.d-net.research-infrastructures.eu</url>
|
||||
|
|
Loading…
Reference in New Issue