Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop

This commit is contained in:
Sandro La Bruzzo 2020-03-31 14:41:41 +02:00
commit 201d79021e
19 changed files with 29 additions and 63 deletions

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build-assembly-resources</artifactId> <artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId> <artifactId>dhp-build-properties-maven-plugin</artifactId>
@ -102,7 +102,7 @@
</goals> </goals>
</pluginExecutionFilter> </pluginExecutionFilter>
<action> <action>
<ignore></ignore> <ignore />
</action> </action>
</pluginExecution> </pluginExecution>
</pluginExecutions> </pluginExecutions>

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<packaging>pom</packaging> <packaging>pom</packaging>

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-aggregation</artifactId> <artifactId>dhp-aggregation</artifactId>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId> <artifactId>dhp-dedup-openaire</artifactId>

View File

@ -1,10 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,13 +0,0 @@
----------------------------------------------------------------
Thu Mar 26 19:43:00 CET 2020:
Booting Derby version The Apache Software Foundation - Apache Derby - 10.12.1.1 - (1704137): instance a816c00e-0171-1827-9724-000012c70f40
on database directory /private/var/folders/xn/nr5vdk8n1572rvrnx5890_d80000gn/T/junit3871072562876431144/junit_metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@4e6b5ed4
Loaded from file:/Users/claudio/.m2/repository/org/apache/derby/derby/10.12.1.1/derby-10.12.1.1.jar
java.vendor=Oracle Corporation
java.runtime.version=1.8.0_181-b13
user.dir=/Users/claudio/workspace/git/dnet-hadoop/dhp-workflows/dhp-graph-mapper
os.name=Mac OS X
os.arch=x86_64
os.version=10.15.3
derby.system.home=null
Database Class Loader started - derby.database.classpath=''

View File

@ -1,10 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,10 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,14 +0,0 @@
sparkExecutorCoresForJoining=1
sparkDriverMemoryForJoining=10G
sparkExecutorMemoryForJoining=15G
sparkExecutorCoresForIndexing=64
sparkDriverMemoryForIndexing=3G
sparkExecutorMemoryForIndexing=2G
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
outputPath=/tmp/openaire_provision
format=TMF
batchSize=2000
reuseRecords=false
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -29,14 +29,9 @@ public class SparkXmlRecordBuilderJob {
final String otherDsTypeId = parser.get("otherDsTypeId"); final String otherDsTypeId = parser.get("otherDsTypeId");
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
if (fs.exists(new Path(outputPath))) {
fs.delete(new Path(outputPath), true);
fs.mkdirs(new Path(outputPath));
}
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
.adjacencyLists(); .adjacencyLists();
//.asXML();
} }
} }

View File

@ -7,17 +7,17 @@ import java.nio.charset.StandardCharsets;
public class TemplateResources { public class TemplateResources {
private String record = read("eu/dnetlib/dhp/graph/template/record.st"); private String record = read("eu/dnetlib/dhp/oa/provision/template/record.st");
private String instance = read("eu/dnetlib/dhp/graph/template/instance.st"); private String instance = read("eu/dnetlib/dhp/oa/provision/template/instance.st");
private String rel = read("eu/dnetlib/dhp/graph/template/rel.st"); private String rel = read("eu/dnetlib/dhp/oa/provision/template/rel.st");
private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st"); private String webresource = read("eu/dnetlib/dhp/oa/provision/template/webresource.st");
private String child = read("eu/dnetlib/dhp/graph/template/child.st"); private String child = read("eu/dnetlib/dhp/oa/provision/template/child.st");
private String entity = read("eu/dnetlib/dhp/graph/template/entity.st"); private String entity = read("eu/dnetlib/dhp/oa/provision/template/entity.st");
private static String read(final String classpathResource) throws IOException { private static String read(final String classpathResource) throws IOException {
return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8);

View File

@ -58,6 +58,10 @@
<action name="adjancency_lists"> <action name="adjancency_lists">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</prepare>
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>build_adjacency_lists</name> <name>build_adjacency_lists</name>
@ -67,7 +71,6 @@
--executor-cores ${sparkExecutorCoresForJoining} --executor-cores ${sparkExecutorCoresForJoining}
--executor-memory ${sparkExecutorMemoryForJoining} --executor-memory ${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining} --driver-memory=${sparkDriverMemoryForJoining}
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>

View File

@ -1,11 +1,9 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<url>http://www.d-net.research-infrastructures.eu</url> <url>http://www.d-net.research-infrastructures.eu</url>