forked from D-Net/dnet-hadoop
refactored structure of the project
This commit is contained in:
parent
c10770cd3e
commit
1eb0281b38
|
@ -1,7 +0,0 @@
|
|||
Module utilized by `dhp-wf`.
|
||||
|
||||
Contains all resources required by this parent module:
|
||||
|
||||
* assembly XML definitions
|
||||
* build shell scripts
|
||||
* oozie package commands for uploading, running and monitoring oozie workflows
|
|
@ -1,24 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
|
@ -1,32 +0,0 @@
|
|||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
|
||||
<id>oozie-installer</id>
|
||||
<formats>
|
||||
<format>dir</format>
|
||||
</formats>
|
||||
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<filtered>true</filtered>
|
||||
<directory>${project.build.directory}/assembly-resources/commands</directory>
|
||||
<!--
|
||||
works for (local module resources):
|
||||
<directory>src/main/resources</directory>
|
||||
does not work for:
|
||||
<directory>classpath:/commands</directory>
|
||||
<directory>commands</directory>
|
||||
<directory>classpath/src/main/resources</directory>
|
||||
-->
|
||||
<outputDirectory>/</outputDirectory>
|
||||
<includes>
|
||||
<include>**/*</include>
|
||||
</includes>
|
||||
<fileMode>0755</fileMode>
|
||||
<lineEnding>unix</lineEnding>
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
<baseDirectory>/</baseDirectory>
|
||||
</assembly>
|
|
@ -1,24 +0,0 @@
|
|||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
|
||||
<id>tests</id>
|
||||
<formats>
|
||||
<format>jar</format>
|
||||
</formats>
|
||||
<includeBaseDirectory>false</includeBaseDirectory>
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<directory>${project.build.testOutputDirectory}
|
||||
</directory>
|
||||
<outputDirectory />
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
<!-- <dependencySets>
|
||||
<dependencySet>
|
||||
<useProjectArtifact>false</useProjectArtifact>
|
||||
<outputDirectory>lib</outputDirectory>
|
||||
</dependencySet>
|
||||
</dependencySets>-->
|
||||
</assembly>
|
|
@ -1,3 +0,0 @@
|
|||
#!/bin/bash
# Copies the workflow working directory from HDFS into the current local directory.
# NOTE(review): ${workingDir} looks like a placeholder substituted at build time
# (Maven resource filtering) rather than a shell variable — confirm.
# Quoted so that a path containing whitespace is passed as a single argument.
hadoop fs -get "${workingDir}"
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
#!/bin/bash
# Lists the contents of the workflow working directory on HDFS.
# NOTE(review): ${workingDir} looks like a placeholder substituted at build time
# (Maven resource filtering) rather than a shell variable — confirm.
echo ""
echo "---->Contents of the working directory"
# Quoted so that a path containing whitespace is passed as a single argument.
hadoop fs -ls "${workingDir}"
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
Execute the scripts in the following order:
|
||||
|
||||
1. `upload_workflow.sh`
|
||||
2. `run_workflow.sh`
|
||||
3. `print_working_dir.sh` or `get_working_dir.sh`
|
|
@ -1,10 +0,0 @@
|
|||
#!/bin/bash
# Submits and starts an Oozie job configured by a job.properties file.
# Without arguments the file is read from the current directory; otherwise it is
# read from the directory passed as the first argument.
# NOTE(review): ${oozieServiceLoc} looks like a placeholder substituted at build
# time (Maven resource filtering) rather than a shell variable — confirm.

if [ $# -eq 0 ] ; then
    oozie job -oozie "${oozieServiceLoc}" -config job.properties -run
else
    # Quoted so that a directory name containing whitespace stays one argument.
    oozie job -oozie "${oozieServiceLoc}" -config "$1/job.properties" -run
fi
|
||||
|
||||
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
#!/bin/bash
# Uploads the local Oozie application directory into a freshly created HDFS sandbox.
# Without arguments the application directory is resolved against the current
# directory; otherwise against the directory passed as the first argument.
# NOTE(review): ${oozieAppDir}, ${sandboxDir} and ${workingDir} look like
# placeholders substituted at build time (Maven resource filtering) — confirm.

# Route the xtrace output to stdout: fd 3 is a duplicate of fd 1.
exec 3>&1
BASH_XTRACEFD=3
set -x ## print every executed command

# The placeholder stays inside single quotes so bash never expands it itself;
# the dynamic part is double-quoted so paths with whitespace survive intact.
if [ $# -eq 0 ] ; then
    target_dir_root="$(pwd)"'/${oozieAppDir}'
else
    target_dir_root="$(readlink -f "$1")"'/${oozieAppDir}'
fi

# initial phase, creating symbolic links to jars in all subworkflows
# currently disabled
#libDir=$target_dir_root'/lib'
#dirs=`find $target_dir_root/* -maxdepth 10 -type d`
#for dir in $dirs
#do
#        if [ -f $dir/workflow.xml ]
#        then
#                echo "creating symbolic links to jars in directory: $dir/lib"
#                if [ ! -d "$dir/lib" ]; then
#                        mkdir $dir/lib
#                fi
#                find $libDir -type f -exec ln -s \{\} $dir/lib \;
#        fi
#done

#uploading
# The sandbox is removed and recreated, so every upload starts from a clean state.
hadoop fs -rm -r "${sandboxDir}"
hadoop fs -mkdir -p "${sandboxDir}"
hadoop fs -mkdir -p "${workingDir}"
hadoop fs -put "$target_dir_root" "${sandboxDir}"
|
|
@ -1,7 +0,0 @@
|
|||
#sandboxName when not provided explicitly will be generated
|
||||
sandboxName=${sandboxName}
|
||||
sandboxDir=/user/${iis.hadoop.frontend.user.name}/${sandboxName}
|
||||
workingDir=${sandboxDir}/working_dir
|
||||
oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
|
||||
oozieTopWfApplicationPath = ${oozie.wf.application.path}
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
|
||||
<id>oozie-installer</id>
|
||||
<formats>
|
||||
<format>dir</format>
|
||||
</formats>
|
||||
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<filtered>true</filtered>
|
||||
<directory>${project.build.directory}/assembly-resources/commands</directory>
|
||||
<!--
|
||||
works for (local module resources):
|
||||
<directory>src/main/resources</directory>
|
||||
does not work for:
|
||||
<directory>classpath:/commands</directory>
|
||||
<directory>commands</directory>
|
||||
<directory>classpath/src/main/resources</directory>
|
||||
-->
|
||||
<outputDirectory>/</outputDirectory>
|
||||
<includes>
|
||||
<include>**/*</include>
|
||||
</includes>
|
||||
<fileMode>0755</fileMode>
|
||||
<lineEnding>unix</lineEnding>
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
<baseDirectory>/</baseDirectory>
|
||||
</assembly>
|
|
@ -1,24 +0,0 @@
|
|||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
|
||||
<id>tests</id>
|
||||
<formats>
|
||||
<format>jar</format>
|
||||
</formats>
|
||||
<includeBaseDirectory>false</includeBaseDirectory>
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<directory>${project.build.testOutputDirectory}
|
||||
</directory>
|
||||
<outputDirectory />
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
<!-- <dependencySets>
|
||||
<dependencySet>
|
||||
<useProjectArtifact>false</useProjectArtifact>
|
||||
<outputDirectory>lib</outputDirectory>
|
||||
</dependencySet>
|
||||
</dependencySets>-->
|
||||
</assembly>
|
|
@ -1,3 +0,0 @@
|
|||
#!/bin/bash
# Copies the workflow working directory from HDFS into the current local directory.
# NOTE(review): ${workingDir} looks like a placeholder substituted at build time
# (Maven resource filtering) rather than a shell variable — confirm.
# Quoted so that a path containing whitespace is passed as a single argument.
hadoop fs -get "${workingDir}"
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
#!/bin/bash
# Lists the contents of the workflow working directory on HDFS.
# NOTE(review): ${workingDir} looks like a placeholder substituted at build time
# (Maven resource filtering) rather than a shell variable — confirm.
echo ""
echo "---->Contents of the working directory"
# Quoted so that a path containing whitespace is passed as a single argument.
hadoop fs -ls "${workingDir}"
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
Execute the scripts in the following order:
|
||||
|
||||
1. `upload_workflow.sh`
|
||||
2. `run_workflow.sh`
|
||||
3. `print_working_dir.sh` or `get_working_dir.sh`
|
|
@ -1,10 +0,0 @@
|
|||
#!/bin/bash
# Submits and starts an Oozie job configured by a job.properties file.
# Without arguments the file is read from the current directory; otherwise it is
# read from the directory passed as the first argument.
# NOTE(review): ${oozieServiceLoc} looks like a placeholder substituted at build
# time (Maven resource filtering) rather than a shell variable — confirm.

if [ $# -eq 0 ] ; then
    oozie job -oozie "${oozieServiceLoc}" -config job.properties -run
else
    # Quoted so that a directory name containing whitespace stays one argument.
    oozie job -oozie "${oozieServiceLoc}" -config "$1/job.properties" -run
fi
|
||||
|
||||
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
#!/bin/bash
# Uploads the local Oozie application directory into a freshly created HDFS sandbox.
# Without arguments the application directory is resolved against the current
# directory; otherwise against the directory passed as the first argument.
# NOTE(review): ${oozieAppDir}, ${sandboxDir} and ${workingDir} look like
# placeholders substituted at build time (Maven resource filtering) — confirm.

# Route the xtrace output to stdout: fd 3 is a duplicate of fd 1.
exec 3>&1
BASH_XTRACEFD=3
set -x ## print every executed command

# The placeholder stays inside single quotes so bash never expands it itself;
# the dynamic part is double-quoted so paths with whitespace survive intact.
if [ $# -eq 0 ] ; then
    target_dir_root="$(pwd)"'/${oozieAppDir}'
else
    target_dir_root="$(readlink -f "$1")"'/${oozieAppDir}'
fi

# initial phase, creating symbolic links to jars in all subworkflows
# currently disabled
#libDir=$target_dir_root'/lib'
#dirs=`find $target_dir_root/* -maxdepth 10 -type d`
#for dir in $dirs
#do
#        if [ -f $dir/workflow.xml ]
#        then
#                echo "creating symbolic links to jars in directory: $dir/lib"
#                if [ ! -d "$dir/lib" ]; then
#                        mkdir $dir/lib
#                fi
#                find $libDir -type f -exec ln -s \{\} $dir/lib \;
#        fi
#done

#uploading
# The sandbox is removed and recreated, so every upload starts from a clean state.
hadoop fs -rm -r "${sandboxDir}"
hadoop fs -mkdir -p "${sandboxDir}"
hadoop fs -mkdir -p "${workingDir}"
hadoop fs -put "$target_dir_root" "${sandboxDir}"
|
|
@ -1,7 +0,0 @@
|
|||
#sandboxName when not provided explicitly will be generated
|
||||
sandboxName=${sandboxName}
|
||||
sandboxDir=/user/${iis.hadoop.frontend.user.name}/${sandboxName}
|
||||
workingDir=${sandboxDir}/working_dir
|
||||
oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
|
||||
oozieTopWfApplicationPath = ${oozie.wf.application.path}
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
Maven plugin module utilized by `dhp-wf` for proper `job.properties` file building.
|
||||
|
||||
It is based on http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html and supplemented with:
|
||||
|
||||
* handling of the `includePropertyKeysFromFiles` property, which allows writing only the properties whose keys are listed in the given property files
|
||||
As a final outcome, only properties that are listed in the `<include>` element or whose keys appear in the files from the `<includePropertyKeysFromFiles>` element will be written to the output file.
|
|
@ -1,68 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
<packaging>maven-plugin</packaging>
|
||||
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-plugin-api</artifactId>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-project</artifactId>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.kuali.maven.plugins</groupId>
|
||||
<artifactId>properties-maven-plugin</artifactId>
|
||||
<version>1.3.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
<build>
|
||||
<directory>target</directory>
|
||||
<outputDirectory>target/classes</outputDirectory>
|
||||
<finalName>${project.artifactId}-${project.version}</finalName>
|
||||
<testOutputDirectory>target/test-classes</testOutputDirectory>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<configuration>
|
||||
<detectLinks>true</detectLinks>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
|
@ -1,71 +0,0 @@
|
|||
package eu.dnetlib.maven.plugin.properties;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.maven.plugin.AbstractMojo;
|
||||
import org.apache.maven.plugin.MojoExecutionException;
|
||||
import org.apache.maven.plugin.MojoFailureException;
|
||||
|
||||
/**
|
||||
* Generates oozie properties which were not provided from commandline.
|
||||
* @author mhorst
|
||||
*
|
||||
* @goal generate-properties
|
||||
*/
|
||||
public class GenerateOoziePropertiesMojo extends AbstractMojo {
|
||||
|
||||
public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir";
|
||||
public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName";
|
||||
|
||||
private final String[] limiters = {"iis", "dnetlib", "eu", "dhp"};
|
||||
|
||||
@Override
|
||||
public void execute() throws MojoExecutionException, MojoFailureException {
|
||||
if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) &&
|
||||
!System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) {
|
||||
String generatedSandboxName = generateSandboxName(System.getProperties().getProperty(
|
||||
PROPERTY_NAME_WF_SOURCE_DIR));
|
||||
if (generatedSandboxName!=null) {
|
||||
System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME,
|
||||
generatedSandboxName);
|
||||
} else {
|
||||
System.out.println("unable to generate sandbox name from path: " +
|
||||
System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates sandbox name from workflow source directory.
|
||||
* @param wfSourceDir
|
||||
* @return generated sandbox name
|
||||
*/
|
||||
private String generateSandboxName(String wfSourceDir) {
|
||||
// utilize all dir names until finding one of the limiters
|
||||
List<String> sandboxNameParts = new ArrayList<String>();
|
||||
String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar);
|
||||
ArrayUtils.reverse(tokens);
|
||||
if (tokens.length>0) {
|
||||
for (String token : tokens) {
|
||||
for (String limiter : limiters) {
|
||||
if (limiter.equals(token)) {
|
||||
return sandboxNameParts.size()>0?
|
||||
StringUtils.join(sandboxNameParts.toArray()):null;
|
||||
}
|
||||
}
|
||||
if (sandboxNameParts.size()>0) {
|
||||
sandboxNameParts.add(0, File.separator);
|
||||
}
|
||||
sandboxNameParts.add(0, token);
|
||||
}
|
||||
return StringUtils.join(sandboxNameParts.toArray());
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,436 +0,0 @@
|
|||
/**
|
||||
*
|
||||
* Licensed under the Educational Community License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.opensource.org/licenses/ecl2.php
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package eu.dnetlib.maven.plugin.properties;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.maven.plugin.AbstractMojo;
|
||||
import org.apache.maven.plugin.MojoExecutionException;
|
||||
import org.apache.maven.plugin.MojoFailureException;
|
||||
import org.apache.maven.project.MavenProject;
|
||||
import org.springframework.core.io.DefaultResourceLoader;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.core.io.ResourceLoader;
|
||||
|
||||
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
|
||||
|
||||
/**
|
||||
* Writes project properties for the keys listed in specified properties files.
|
||||
* Based on:
|
||||
* http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html
|
||||
|
||||
* @author mhorst
|
||||
* @goal write-project-properties
|
||||
*/
|
||||
public class WritePredefinedProjectProperties extends AbstractMojo {
|
||||
|
||||
private static final String CR = "\r";
|
||||
private static final String LF = "\n";
|
||||
private static final String TAB = "\t";
|
||||
protected static final String PROPERTY_PREFIX_ENV = "env.";
|
||||
private static final String ENCODING_UTF8 = "utf8";
|
||||
|
||||
/**
|
||||
* @parameter property="properties.includePropertyKeysFromFiles"
|
||||
*/
|
||||
private String[] includePropertyKeysFromFiles;
|
||||
|
||||
/**
|
||||
* @parameter default-value="${project}"
|
||||
* @required
|
||||
* @readonly
|
||||
*/
|
||||
protected MavenProject project;
|
||||
|
||||
/**
|
||||
* The file that properties will be written to
|
||||
*
|
||||
* @parameter property="properties.outputFile"
|
||||
* default-value="${project.build.directory}/properties/project.properties";
|
||||
* @required
|
||||
*/
|
||||
protected File outputFile;
|
||||
|
||||
/**
|
||||
* If true, the plugin will silently ignore any non-existent properties files, and the build will continue
|
||||
*
|
||||
* @parameter property="properties.quiet" default-value="true"
|
||||
*/
|
||||
private boolean quiet;
|
||||
|
||||
/**
|
||||
* Comma separated list of characters to escape when writing property values. cr=carriage return, lf=linefeed,
|
||||
* tab=tab. Any other values are taken literally.
|
||||
*
|
||||
* @parameter default-value="cr,lf,tab" property="properties.escapeChars"
|
||||
*/
|
||||
private String escapeChars;
|
||||
|
||||
/**
|
||||
* If true, the plugin will include system properties when writing the properties file. System properties override
|
||||
* both environment variables and project properties.
|
||||
*
|
||||
* @parameter default-value="false" property="properties.includeSystemProperties"
|
||||
*/
|
||||
private boolean includeSystemProperties;
|
||||
|
||||
/**
|
||||
* If true, the plugin will include environment variables when writing the properties file. Environment variables
|
||||
* are prefixed with "env". Environment variables override project properties.
|
||||
*
|
||||
* @parameter default-value="false" property="properties.includeEnvironmentVariables"
|
||||
*/
|
||||
private boolean includeEnvironmentVariables;
|
||||
|
||||
/**
|
||||
* Comma separated set of properties to exclude when writing the properties file
|
||||
*
|
||||
* @parameter property="properties.exclude"
|
||||
*/
|
||||
private String exclude;
|
||||
|
||||
/**
|
||||
* Comma separated set of properties to write to the properties file. If provided, only the properties matching
|
||||
* those supplied here will be written to the properties file.
|
||||
*
|
||||
* @parameter property="properties.include"
|
||||
*/
|
||||
private String include;
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.maven.plugin.AbstractMojo#execute()
|
||||
*/
|
||||
@Override
|
||||
@SuppressFBWarnings({"NP_UNWRITTEN_FIELD","UWF_UNWRITTEN_FIELD"})
|
||||
public void execute() throws MojoExecutionException, MojoFailureException {
|
||||
Properties properties = new Properties();
|
||||
// Add project properties
|
||||
properties.putAll(project.getProperties());
|
||||
if (includeEnvironmentVariables) {
|
||||
// Add environment variables, overriding any existing properties with the same key
|
||||
properties.putAll(getEnvironmentVariables());
|
||||
}
|
||||
if (includeSystemProperties) {
|
||||
// Add system properties, overriding any existing properties with the same key
|
||||
properties.putAll(System.getProperties());
|
||||
}
|
||||
|
||||
// Remove properties as appropriate
|
||||
trim(properties, exclude, include);
|
||||
|
||||
String comment = "# " + new Date() + "\n";
|
||||
List<String> escapeTokens = getEscapeChars(escapeChars);
|
||||
|
||||
getLog().info("Creating " + outputFile);
|
||||
writeProperties(outputFile, comment, properties, escapeTokens);
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides environment variables.
|
||||
* @return environment variables
|
||||
*/
|
||||
protected static Properties getEnvironmentVariables() {
|
||||
Properties props = new Properties();
|
||||
for (Entry<String, String> entry : System.getenv().entrySet()) {
|
||||
props.setProperty(PROPERTY_PREFIX_ENV + entry.getKey(), entry.getValue());
|
||||
}
|
||||
return props;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes properties which should not be written.
|
||||
* @param properties
|
||||
* @param omitCSV
|
||||
* @param includeCSV
|
||||
* @throws MojoExecutionException
|
||||
*/
|
||||
protected void trim(Properties properties, String omitCSV, String includeCSV) throws MojoExecutionException {
|
||||
List<String> omitKeys = getListFromCSV(omitCSV);
|
||||
for (String key : omitKeys) {
|
||||
properties.remove(key);
|
||||
}
|
||||
|
||||
List<String> includeKeys = getListFromCSV(includeCSV);
|
||||
// mh: including keys from predefined properties
|
||||
if (includePropertyKeysFromFiles!=null && includePropertyKeysFromFiles.length>0) {
|
||||
for (String currentIncludeLoc : includePropertyKeysFromFiles) {
|
||||
if (validate(currentIncludeLoc)) {
|
||||
Properties p = getProperties(currentIncludeLoc);
|
||||
for (String key : p.stringPropertyNames()) {
|
||||
includeKeys.add(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (includeKeys!=null && !includeKeys.isEmpty()) {
|
||||
// removing only when include keys provided
|
||||
Set<String> keys = properties.stringPropertyNames();
|
||||
for (String key : keys) {
|
||||
if (!includeKeys.contains(key)) {
|
||||
properties.remove(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether file exists.
|
||||
* @param location
|
||||
* @return true when exists, false otherwise.
|
||||
*/
|
||||
protected boolean exists(String location) {
|
||||
if (StringUtils.isBlank(location)) {
|
||||
return false;
|
||||
}
|
||||
File file = new File(location);
|
||||
if (file.exists()) {
|
||||
return true;
|
||||
}
|
||||
ResourceLoader loader = new DefaultResourceLoader();
|
||||
Resource resource = loader.getResource(location);
|
||||
return resource.exists();
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates resource location.
|
||||
* @param location
|
||||
* @return true when valid, false otherwise
|
||||
* @throws MojoExecutionException
|
||||
*/
|
||||
protected boolean validate(String location) throws MojoExecutionException {
|
||||
boolean exists = exists(location);
|
||||
if (exists) {
|
||||
return true;
|
||||
}
|
||||
if (quiet) {
|
||||
getLog().info("Ignoring non-existent properties file '" + location + "'");
|
||||
return false;
|
||||
} else {
|
||||
throw new MojoExecutionException("Non-existent properties file '" + location + "'");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides input stream.
|
||||
* @param location
|
||||
* @return input stream
|
||||
* @throws IOException
|
||||
*/
|
||||
protected InputStream getInputStream(String location) throws IOException {
|
||||
File file = new File(location);
|
||||
if (file.exists()) {
|
||||
return new FileInputStream(location);
|
||||
}
|
||||
ResourceLoader loader = new DefaultResourceLoader();
|
||||
Resource resource = loader.getResource(location);
|
||||
return resource.getInputStream();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates properties for given location.
|
||||
* @param location
|
||||
* @return properties for given location
|
||||
* @throws MojoExecutionException
|
||||
*/
|
||||
protected Properties getProperties(String location) throws MojoExecutionException {
|
||||
InputStream in = null;
|
||||
try {
|
||||
Properties properties = new Properties();
|
||||
in = getInputStream(location);
|
||||
if (location.toLowerCase().endsWith(".xml")) {
|
||||
properties.loadFromXML(in);
|
||||
} else {
|
||||
properties.load(in);
|
||||
}
|
||||
return properties;
|
||||
} catch (IOException e) {
|
||||
throw new MojoExecutionException("Error reading properties file " + location, e);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(in);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides escape characters.
|
||||
* @param escapeChars
|
||||
* @return escape characters
|
||||
*/
|
||||
protected List<String> getEscapeChars(String escapeChars) {
|
||||
List<String> tokens = getListFromCSV(escapeChars);
|
||||
List<String> realTokens = new ArrayList<String>();
|
||||
for (String token : tokens) {
|
||||
String realToken = getRealToken(token);
|
||||
realTokens.add(realToken);
|
||||
}
|
||||
return realTokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides real token.
|
||||
* @param token
|
||||
* @return real token
|
||||
*/
|
||||
protected String getRealToken(String token) {
|
||||
if (token.equalsIgnoreCase("CR")) {
|
||||
return CR;
|
||||
} else if (token.equalsIgnoreCase("LF")) {
|
||||
return LF;
|
||||
} else if (token.equalsIgnoreCase("TAB")) {
|
||||
return TAB;
|
||||
} else {
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns content.
|
||||
* @param comment
|
||||
* @param properties
|
||||
* @param escapeTokens
|
||||
* @return content
|
||||
*/
|
||||
protected String getContent(String comment, Properties properties, List<String> escapeTokens) {
|
||||
List<String> names = new ArrayList<String>(properties.stringPropertyNames());
|
||||
Collections.sort(names);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (!StringUtils.isBlank(comment)) {
|
||||
sb.append(comment);
|
||||
}
|
||||
for (String name : names) {
|
||||
String value = properties.getProperty(name);
|
||||
String escapedValue = escape(value, escapeTokens);
|
||||
sb.append(name + "=" + escapedValue + "\n");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes properties to given file.
|
||||
* @param file
|
||||
* @param comment
|
||||
* @param properties
|
||||
* @param escapeTokens
|
||||
* @throws MojoExecutionException
|
||||
*/
|
||||
protected void writeProperties(File file, String comment, Properties properties, List<String> escapeTokens)
|
||||
throws MojoExecutionException {
|
||||
try {
|
||||
String content = getContent(comment, properties, escapeTokens);
|
||||
FileUtils.writeStringToFile(file, content, ENCODING_UTF8);
|
||||
} catch (IOException e) {
|
||||
throw new MojoExecutionException("Error creating properties file", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes characters.
|
||||
* @param s
|
||||
* @param escapeChars
|
||||
* @return
|
||||
*/
|
||||
protected String escape(String s, List<String> escapeChars) {
|
||||
String result = s;
|
||||
for (String escapeChar : escapeChars) {
|
||||
result = result.replace(escapeChar, getReplacementToken(escapeChar));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides replacement token.
|
||||
* @param escapeChar
|
||||
* @return replacement token
|
||||
*/
|
||||
protected String getReplacementToken(String escapeChar) {
|
||||
if (escapeChar.equals(CR)) {
|
||||
return "\\r";
|
||||
} else if (escapeChar.equals(LF)) {
|
||||
return "\\n";
|
||||
} else if (escapeChar.equals(TAB)) {
|
||||
return "\\t";
|
||||
} else {
|
||||
return "\\" + escapeChar;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns list from csv.
|
||||
* @param csv
|
||||
* @return list of values generated from CSV
|
||||
*/
|
||||
protected static final List<String> getListFromCSV(String csv) {
|
||||
if (StringUtils.isBlank(csv)) {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
List<String> list = new ArrayList<String>();
|
||||
String[] tokens = StringUtils.split(csv, ",");
|
||||
for (String token : tokens) {
|
||||
list.add(token.trim());
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public void setIncludeSystemProperties(boolean includeSystemProperties) {
|
||||
this.includeSystemProperties = includeSystemProperties;
|
||||
}
|
||||
|
||||
public void setEscapeChars(String escapeChars) {
|
||||
this.escapeChars = escapeChars;
|
||||
}
|
||||
|
||||
public void setIncludeEnvironmentVariables(boolean includeEnvironmentVariables) {
|
||||
this.includeEnvironmentVariables = includeEnvironmentVariables;
|
||||
}
|
||||
|
||||
public void setExclude(String exclude) {
|
||||
this.exclude = exclude;
|
||||
}
|
||||
|
||||
public void setInclude(String include) {
|
||||
this.include = include;
|
||||
}
|
||||
|
||||
public void setQuiet(boolean quiet) {
|
||||
this.quiet = quiet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets property files for which keys properties should be included.
|
||||
* @param includePropertyKeysFromFiles
|
||||
*/
|
||||
public void setIncludePropertyKeysFromFiles(
|
||||
String[] includePropertyKeysFromFiles) {
|
||||
if (includePropertyKeysFromFiles!=null) {
|
||||
this.includePropertyKeysFromFiles = Arrays.copyOf(
|
||||
includePropertyKeysFromFiles,
|
||||
includePropertyKeysFromFiles.length);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,101 +0,0 @@
|
|||
package eu.dnetlib.maven.plugin.properties;
|
||||
|
||||
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME;
|
||||
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNull;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class GenerateOoziePropertiesMojoTest {
|
||||
|
||||
private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
|
||||
|
||||
@Before
|
||||
public void clearSystemProperties() {
|
||||
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
|
||||
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteEmpty() throws Exception {
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteSandboxNameAlreadySet() throws Exception {
|
||||
// given
|
||||
String workflowSourceDir = "eu/dnetlib/iis/wf/transformers";
|
||||
String sandboxName = "originalSandboxName";
|
||||
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
|
||||
System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteEmptyWorkflowSourceDir() throws Exception {
|
||||
// given
|
||||
String workflowSourceDir = "";
|
||||
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteNullSandboxNameGenerated() throws Exception {
|
||||
// given
|
||||
String workflowSourceDir = "eu/dnetlib/iis/";
|
||||
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecute() throws Exception {
|
||||
// given
|
||||
String workflowSourceDir = "eu/dnetlib/iis/wf/transformers";
|
||||
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithoutRoot() throws Exception {
|
||||
// given
|
||||
String workflowSourceDir = "wf/transformers";
|
||||
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
}
|
|
@ -1,365 +0,0 @@
|
|||
package eu.dnetlib.maven.plugin.properties;
|
||||
|
||||
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.doReturn;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.maven.plugin.MojoExecutionException;
|
||||
import org.apache.maven.project.MavenProject;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.runners.MockitoJUnitRunner;
|
||||
|
||||
|
||||
/**
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
@RunWith(MockitoJUnitRunner.class)
|
||||
public class WritePredefinedProjectPropertiesTest {
|
||||
|
||||
@Rule
|
||||
public TemporaryFolder testFolder = new TemporaryFolder();
|
||||
|
||||
@Mock
|
||||
private MavenProject mavenProject;
|
||||
|
||||
private WritePredefinedProjectProperties mojo;
|
||||
|
||||
@Before
|
||||
public void init() {
|
||||
mojo = new WritePredefinedProjectProperties();
|
||||
mojo.outputFile = getPropertiesFileLocation();
|
||||
mojo.project = mavenProject;
|
||||
doReturn(new Properties()).when(mavenProject).getProperties();
|
||||
}
|
||||
|
||||
// ----------------------------------- TESTS ---------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testExecuteEmpty() throws Exception {
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(0, storedProperties.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithProjectProperties() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(key));
|
||||
assertEquals(value, storedProperties.getProperty(key));
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteWithProjectPropertiesAndInvalidOutputFile() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
mojo.outputFile = testFolder.getRoot();
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithProjectPropertiesExclusion() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String excludedKey = "excludedPropertyKey";
|
||||
String excludedValue = "excludedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(excludedKey, excludedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
mojo.setExclude(excludedKey);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(key));
|
||||
assertEquals(value, storedProperties.getProperty(key));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithProjectPropertiesInclusion() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String includedKey = "includedPropertyKey";
|
||||
String includedValue = "includedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
mojo.setInclude(includedKey);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromFile() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String includedKey = "includedPropertyKey";
|
||||
String includedValue = "includedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
File includedPropertiesFile = new File(testFolder.getRoot(), "included.properties");
|
||||
Properties includedProperties = new Properties();
|
||||
includedProperties.setProperty(includedKey, "irrelevantValue");
|
||||
includedProperties.store(new FileWriter(includedPropertiesFile), null);
|
||||
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromClasspathResource() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String includedKey = "includedPropertyKey";
|
||||
String includedValue = "includedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {"/eu/dnetlib/maven/plugin/properties/included.properties"});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteIncludingPropertyKeysFromBlankLocation() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String includedKey = "includedPropertyKey";
|
||||
String includedValue = "includedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {""});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromXmlFile() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String includedKey = "includedPropertyKey";
|
||||
String includedValue = "includedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
|
||||
Properties includedProperties = new Properties();
|
||||
includedProperties.setProperty(includedKey, "irrelevantValue");
|
||||
includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null);
|
||||
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile() throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
String includedKey = "includedPropertyKey";
|
||||
String includedValue = "includedPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
|
||||
Properties includedProperties = new Properties();
|
||||
includedProperties.setProperty(includedKey, "irrelevantValue");
|
||||
includedProperties.store(new FileOutputStream(includedPropertiesFile), null);
|
||||
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithQuietModeOn() throws Exception {
|
||||
// given
|
||||
mojo.setQuiet(true);
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertEquals(0, storedProperties.size());
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteIncludingPropertyKeysFromInvalidFile() throws Exception {
|
||||
// given
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithEnvironmentProperties() throws Exception {
|
||||
// given
|
||||
mojo.setIncludeEnvironmentVariables(true);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertTrue(storedProperties.size() > 0);
|
||||
for (Object currentKey : storedProperties.keySet()) {
|
||||
assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithSystemProperties() throws Exception {
|
||||
// given
|
||||
String key = "systemPropertyKey";
|
||||
String value = "systemPropertyValue";
|
||||
System.setProperty(key, value);
|
||||
mojo.setIncludeSystemProperties(true);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertTrue(storedProperties.size() > 0);
|
||||
assertTrue(storedProperties.containsKey(key));
|
||||
assertEquals(value, storedProperties.getProperty(key));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithSystemPropertiesAndEscapeChars() throws Exception {
|
||||
// given
|
||||
String key = "systemPropertyKey ";
|
||||
String value = "systemPropertyValue";
|
||||
System.setProperty(key, value);
|
||||
mojo.setIncludeSystemProperties(true);
|
||||
String escapeChars = "cr,lf,tab,|";
|
||||
mojo.setEscapeChars(escapeChars);
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
assertTrue(storedProperties.size() > 0);
|
||||
assertFalse(storedProperties.containsKey(key));
|
||||
assertTrue(storedProperties.containsKey(key.trim()));
|
||||
assertEquals(value, storedProperties.getProperty(key.trim()));
|
||||
}
|
||||
|
||||
// ----------------------------------- PRIVATE -------------------------------------------
|
||||
|
||||
private File getPropertiesFileLocation() {
|
||||
return new File(testFolder.getRoot(), "test.properties");
|
||||
}
|
||||
|
||||
private Properties getStoredProperties() throws FileNotFoundException, IOException {
|
||||
Properties properties = new Properties();
|
||||
properties.load(new FileInputStream(getPropertiesFileLocation()));
|
||||
return properties;
|
||||
}
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
includedPropertyKey=irrelevantValue
|
|
@ -1,281 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<plugin>
|
||||
<name>dhp-build-properties-maven-plugin</name>
|
||||
<description></description>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<goalPrefix>dhp-build-properties</goalPrefix>
|
||||
<isolatedRealm>false</isolatedRealm>
|
||||
<inheritedByDefault>true</inheritedByDefault>
|
||||
<mojos>
|
||||
<mojo>
|
||||
<goal>generate-properties</goal>
|
||||
<description>Generates oozie properties which were not provided from commandline.</description>
|
||||
<requiresDirectInvocation>false</requiresDirectInvocation>
|
||||
<requiresProject>true</requiresProject>
|
||||
<requiresReports>false</requiresReports>
|
||||
<aggregator>false</aggregator>
|
||||
<requiresOnline>false</requiresOnline>
|
||||
<inheritedByDefault>true</inheritedByDefault>
|
||||
<implementation>eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo</implementation>
|
||||
<language>java</language>
|
||||
<instantiationStrategy>per-lookup</instantiationStrategy>
|
||||
<executionStrategy>once-per-session</executionStrategy>
|
||||
<threadSafe>false</threadSafe>
|
||||
<parameters/>
|
||||
</mojo>
|
||||
<mojo>
|
||||
<goal>write-project-properties</goal>
|
||||
<description>Writes project properties for the keys listed in specified properties files.
|
||||
Based on:
|
||||
http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html</description>
|
||||
<requiresDirectInvocation>false</requiresDirectInvocation>
|
||||
<requiresProject>true</requiresProject>
|
||||
<requiresReports>false</requiresReports>
|
||||
<aggregator>false</aggregator>
|
||||
<requiresOnline>false</requiresOnline>
|
||||
<inheritedByDefault>true</inheritedByDefault>
|
||||
<implementation>eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties</implementation>
|
||||
<language>java</language>
|
||||
<instantiationStrategy>per-lookup</instantiationStrategy>
|
||||
<executionStrategy>once-per-session</executionStrategy>
|
||||
<threadSafe>false</threadSafe>
|
||||
<parameters>
|
||||
<parameter>
|
||||
<name>properties.escapeChars</name>
|
||||
<type>java.lang.String</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description>Comma separated list of characters to escape when writing property values. cr=carriage return, lf=linefeed,
|
||||
tab=tab. Any other values are taken literally.</description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.exclude</name>
|
||||
<type>java.lang.String</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description>Comma separated set of properties to exclude when writing the properties file</description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.include</name>
|
||||
<type>java.lang.String</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description>Comma separated set of properties to write to the properties file. If provided, only the properties matching
|
||||
those supplied here will be written to the properties file.</description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.includeEnvironmentVariables</name>
|
||||
<type>boolean</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description>If true, the plugin will include environment variables when writing the properties file. Environment variables
|
||||
are prefixed with "env". Environment variables override project properties.</description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.includePropertyKeysFromFiles</name>
|
||||
<type>java.lang.String[]</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description></description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.includeSystemProperties</name>
|
||||
<type>boolean</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description>If true, the plugin will include system properties when writing the properties file. System properties override
|
||||
both environment variables and project properties.</description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.outputFile</name>
|
||||
<type>java.io.File</type>
|
||||
<required>true</required>
|
||||
<editable>true</editable>
|
||||
<description>The file that properties will be written to</description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>project</name>
|
||||
<type>org.apache.maven.project.MavenProject</type>
|
||||
<required>true</required>
|
||||
<editable>false</editable>
|
||||
<description></description>
|
||||
</parameter>
|
||||
<parameter>
|
||||
<name>properties.quiet</name>
|
||||
<type>boolean</type>
|
||||
<required>false</required>
|
||||
<editable>true</editable>
|
||||
<description>If true, the plugin will silently ignore any non-existent properties files, and the build will continue</description>
|
||||
</parameter>
|
||||
</parameters>
|
||||
<configuration>
|
||||
<properties.escapeChars implementation="java.lang.String" default-value="cr,lf,tab"/>
|
||||
<properties.includeEnvironmentVariables implementation="boolean" default-value="false"/>
|
||||
<properties.includeSystemProperties implementation="boolean" default-value="false"/>
|
||||
<properties.outputFile implementation="java.io.File" default-value="${project.build.directory}/properties/project.properties"/>
|
||||
<project implementation="org.apache.maven.project.MavenProject" default-value="${project}"/>
|
||||
<properties.quiet implementation="boolean" default-value="true"/>
|
||||
</configuration>
|
||||
</mojo>
|
||||
</mojos>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-plugin-api</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-project</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-profile</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-model</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-artifact-manager</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-repository-metadata</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven.wagon</groupId>
|
||||
<artifactId>wagon-provider-api</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.0-alpha-5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.plexus</groupId>
|
||||
<artifactId>plexus-utils</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.0.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.maven</groupId>
|
||||
<artifactId>maven-artifact</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.plexus</groupId>
|
||||
<artifactId>plexus-container-default</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.0-alpha-8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>classworlds</groupId>
|
||||
<artifactId>classworlds</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.1-alpha-2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.kuali.maven.plugins</groupId>
|
||||
<artifactId>properties-maven-plugin</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.3.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-core</artifactId>
|
||||
<type>jar</type>
|
||||
<version>3.1.1.RELEASE</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-asm</artifactId>
|
||||
<type>jar</type>
|
||||
<version>3.1.1.RELEASE</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jasypt</groupId>
|
||||
<artifactId>jasypt</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.9.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.kuali.maven.common</groupId>
|
||||
<artifactId>maven-kuali-common</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.2.8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.ant</groupId>
|
||||
<artifactId>ant</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.8.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.ant</groupId>
|
||||
<artifactId>ant-launcher</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.8.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.plexus</groupId>
|
||||
<artifactId>plexus-interpolation</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.15</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-lang</groupId>
|
||||
<artifactId>commons-lang</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<type>jar</type>
|
||||
<version>2.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>jcl-over-slf4j</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.6.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.7.22</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.7.22</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<type>jar</type>
|
||||
<version>1.2.17</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>javax.servlet</groupId>
|
||||
<artifactId>javax.servlet-api</artifactId>
|
||||
<type>jar</type>
|
||||
<version>3.1.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</plugin>
|
Binary file not shown.
Binary file not shown.
|
@ -1,2 +0,0 @@
|
|||
eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.class
|
||||
eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.class
|
|
@ -1,2 +0,0 @@
|
|||
/Users/claudio/workspace/dnet-hadoop/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java
|
||||
/Users/claudio/workspace/dnet-hadoop/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java
|
|
@ -9,8 +9,7 @@
|
|||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
<modules>
|
||||
<module>dhp-build-assembly-resources</module>
|
||||
<module>dhp-build-properties-maven-plugin</module>
|
||||
|
||||
</modules>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -3,13 +3,15 @@
|
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-parent</artifactId>
|
||||
<version>2.1.3.RELEASE</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dhp-collector-worker</artifactId>
|
||||
<version>1.0.0</version>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -1,5 +1,12 @@
|
|||
package eu.dnetlib.collector.worker;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
|
||||
import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
@ -10,21 +17,21 @@ import org.springframework.boot.CommandLineRunner;
|
|||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
|
||||
import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* DnetCollectorWorkerApplication is the main class responsible for starting
|
||||
* the Dnet Collection into HDFS.
|
||||
* This module is executed on the Hadoop cluster and takes as input some parameters
|
||||
* that tell it which collector plugin to use and where to store the data in HDFS.
|
||||
*
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
@SpringBootApplication
|
||||
public class DnetCollectorWorkerApplication implements CommandLineRunner {
|
||||
|
||||
|
@ -42,7 +49,9 @@ public class DnetCollectorWorkerApplication implements CommandLineRunner {
|
|||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* This module expects two arguments:
|
||||
* the HDFS path where the sequence file is stored.
|
||||
* Json serialization of {@link ApiDescriptor}
|
||||
*/
|
||||
@Override
|
||||
public void run(final String... args) throws Exception {
|
||||
|
@ -70,12 +79,9 @@ public class DnetCollectorWorkerApplication implements CommandLineRunner {
|
|||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
// Set HADOOP user
|
||||
System.setProperty("HADOOP_USER_NAME", "sandro.labruzzo");
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
//Get the filesystem - HDFS
|
||||
FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf);
|
||||
|
||||
Path hdfswritepath = new Path(hdfsPath);
|
||||
|
||||
log.info("Created path "+hdfswritepath.toString());
|
||||
|
|
|
@ -15,163 +15,10 @@
|
|||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.oozie</groupId>
|
||||
<artifactId>oozie-core</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.10</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.10</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro-mapred</artifactId>
|
||||
<classifier>hadoop2</classifier>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<!-- required by caching mechanism for setting chmod -->
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-beans</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.beust</groupId>
|
||||
<artifactId>jcommander</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pig</groupId>
|
||||
<artifactId>pig</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.linkedin.datafu</groupId>
|
||||
<artifactId>datafu</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-beanutils</groupId>
|
||||
<artifactId>commons-beanutils</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.jdom</groupId>
|
||||
<artifactId>jdom</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
|
||||
<!-- Plugin that generates Java classes from Avro schemas -->
|
||||
<plugin>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-test-sources</phase>
|
||||
<goals>
|
||||
<goal>schema</goal>
|
||||
<goal>idl-protocol</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<stringType>String</stringType>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>add-test-sources</id>
|
||||
<phase>generate-test-sources</phase>
|
||||
<goals>
|
||||
<goal>add-test-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>${project.build.directory}/generated-test-sources/avro/</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<!-- Plugin that generates jar with test classes -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludedGroups>eu.dnetlib.iis.common.IntegrationTest</excludedGroups>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-failsafe-plugin</artifactId>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,106 +0,0 @@
|
|||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import org.apache.hadoop.conf.Configurable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FsShell;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.util.ClassUtils;
|
||||
import org.springframework.util.ReflectionUtils;
|
||||
|
||||
/**
|
||||
* Extracted from:
|
||||
* https://github.com/spring-projects/spring-hadoop/blob/master/spring-hadoop-core/src/main/java/org/springframework/data/hadoop/fs/FsShellPermissions.java
|
||||
*
|
||||
* Utility class for accessing Hadoop FsShellPermissions (which is not public)
|
||||
* without having to duplicate its code.
|
||||
* @author Costin Leau
|
||||
*
|
||||
*/
|
||||
public class FsShellPermissions {
|
||||
|
||||
private static boolean IS_HADOOP_20X = ClassUtils.isPresent("org.apache.hadoop.fs.FsShellPermissions$Chmod",
|
||||
FsShellPermissions.class.getClassLoader());
|
||||
|
||||
public enum Op {
|
||||
CHOWN("-chown"), CHMOD("-chmod"), CHGRP("-chgrp");
|
||||
|
||||
private final String cmd;
|
||||
|
||||
Op(String cmd) {
|
||||
this.cmd = cmd;
|
||||
}
|
||||
|
||||
public String getCmd() {
|
||||
return cmd;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: move this into Spring Core (but add JDK 1.5 compatibility first)
|
||||
@SafeVarargs
|
||||
static <T> T[] concatAll(T[] first, T[]... rest) {
|
||||
// can add some sanity checks
|
||||
int totalLength = first.length;
|
||||
for (T[] array : rest) {
|
||||
totalLength += array.length;
|
||||
}
|
||||
T[] result = Arrays.copyOf(first, totalLength);
|
||||
int offset = first.length;
|
||||
for (T[] array : rest) {
|
||||
System.arraycopy(array, 0, result, offset, array.length);
|
||||
offset += array.length;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static void changePermissions(FileSystem fs, Configuration config,
|
||||
Op op, boolean recursive, String group, String uri) {
|
||||
changePermissions(fs, config, op, recursive, group, new String[] {uri});
|
||||
}
|
||||
|
||||
public static void changePermissions(FileSystem fs, Configuration config,
|
||||
Op op, boolean recursive, String group, String... uris) {
|
||||
String[] argvs;
|
||||
if (recursive) {
|
||||
argvs = new String[1];
|
||||
argvs[0] = "-R";
|
||||
} else {
|
||||
argvs = new String[0];
|
||||
}
|
||||
argvs = concatAll(argvs, new String[] { group }, uris);
|
||||
|
||||
// Hadoop 1.0.x
|
||||
if (!IS_HADOOP_20X) {
|
||||
Class<?> cls = ClassUtils.resolveClassName("org.apache.hadoop.fs.FsShellPermissions", config.getClass().getClassLoader());
|
||||
Object[] args = new Object[] { fs, op.getCmd(), argvs, 0, new FsShell(config) };
|
||||
|
||||
Method m = ReflectionUtils.findMethod(cls, "changePermissions", FileSystem.class, String.class, String[].class, int.class, FsShell.class);
|
||||
ReflectionUtils.makeAccessible(m);
|
||||
ReflectionUtils.invokeMethod(m, null, args);
|
||||
}
|
||||
// Hadoop 2.x
|
||||
else {
|
||||
Class<?> cmd = ClassUtils.resolveClassName("org.apache.hadoop.fs.shell.Command", config.getClass().getClassLoader());
|
||||
Class<?> targetClz = ClassUtils.resolveClassName("org.apache.hadoop.fs.FsShellPermissions$Chmod", config.getClass().getClassLoader());
|
||||
Configurable target = (Configurable) BeanUtils.instantiate(targetClz);
|
||||
target.setConf(config);
|
||||
// run(String...) swallows the exceptions - re-implement it here
|
||||
//
|
||||
LinkedList<String> args = new LinkedList<String>(Arrays.asList(argvs));
|
||||
try {
|
||||
Method m = ReflectionUtils.findMethod(cmd, "processOptions", LinkedList.class);
|
||||
ReflectionUtils.makeAccessible(m);
|
||||
ReflectionUtils.invokeMethod(m, target, args);
|
||||
m = ReflectionUtils.findMethod(cmd, "processRawArguments", LinkedList.class);
|
||||
ReflectionUtils.makeAccessible(m);
|
||||
ReflectionUtils.invokeMethod(m, target, args);
|
||||
} catch (IllegalStateException ex){
|
||||
throw new RuntimeException("Cannot change permissions/ownership " + ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,75 +0,0 @@
|
|||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
|
||||
/**
 * Constant values used by the InfoSpace model: separators, row prefixes,
 * semantic class and scheme identifiers, and external identifier types.
 *
 * @author mhorst
 */
public final class InfoSpaceConstants {

    public static final float CONFIDENCE_TO_TRUST_LEVEL_FACTOR = 0.9f;

    public static final String ENCODING_UTF8 = "utf-8";

    public static final char ROW_PREFIX_SEPARATOR = '|';

    public static final String ID_NAMESPACE_SEPARATOR = "::";
    public static final String CLASSIFICATION_HIERARCHY_SEPARATOR = ID_NAMESPACE_SEPARATOR;
    public static final String INFERENCE_PROVENANCE_SEPARATOR = ID_NAMESPACE_SEPARATOR;

    public static final String ROW_PREFIX_RESULT = "50|";
    public static final String ROW_PREFIX_PROJECT = "40|";
    public static final String ROW_PREFIX_PERSON = "30|";
    public static final String ROW_PREFIX_ORGANIZATION = "20|";
    public static final String ROW_PREFIX_DATASOURCE = "10|";

    public static final String QUALIFIER_BODY_STRING = "body";

    /**
     * UTF-8 encoded bytes of {@link #QUALIFIER_BODY_STRING}.
     * Encoded through {@code StandardCharsets.UTF_8}, so no charset lookup by name
     * (and no checked {@code UnsupportedEncodingException}) is involved.
     */
    public static final byte[] QUALIFIER_BODY =
            QUALIFIER_BODY_STRING.getBytes(java.nio.charset.StandardCharsets.UTF_8);

    public static final String SEMANTIC_CLASS_MAIN_TITLE = "main title";
    public static final String SEMANTIC_CLASS_PUBLICATION = "publication";
    public static final String SEMANTIC_CLASS_UNKNOWN = "UNKNOWN";

    public static final String SEMANTIC_SCHEME_DNET_PERSON_ROLES = "dnet:personroles";
    public static final String SEMANTIC_SCHEME_DNET_RELATIONS_RESULT_RESULT = "dnet:result_result_relations";
    public static final String SEMANTIC_SCHEME_DNET_RELATIONS_RESULT_PROJECT = "dnet:result_project_relations";

    public static final String SEMANTIC_SCHEME_DNET_TITLE = "dnet:dataCite_title";
    public static final String SEMANTIC_SCHEME_DNET_TITLE_TYPOLOGIES = "dnet:title_typologies";
    public static final String SEMANTIC_SCHEME_DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
    public static final String SEMANTIC_SCHEME_DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
    public static final String SEMANTIC_SCHEME_DNET_LANGUAGES = "dnet:languages";
    public static final String SEMANTIC_SCHEME_DNET_PID_TYPES = "dnet:pid_types";
    public static final String SEMANTIC_SCHEME_DNET_CLASSIFICATION_TAXONOMIES = "dnet:subject_classification_typologies";

    // resultResult citation and similarity related
    public static final String SEMANTIC_SCHEME_DNET_DATASET_PUBLICATION_RELS = "dnet:dataset_publication_rels";

    public static final String SEMANTIC_CLASS_TAXONOMIES_ARXIV = "arxiv";
    public static final String SEMANTIC_CLASS_TAXONOMIES_WOS = "wos";
    public static final String SEMANTIC_CLASS_TAXONOMIES_DDC = "ddc";
    public static final String SEMANTIC_CLASS_TAXONOMIES_MESHEUROPMC = "mesheuropmc";
    public static final String SEMANTIC_CLASS_TAXONOMIES_ACM = "acm";

    public static final String EXTERNAL_ID_TYPE_INSTANCE_URL = "dnet:instance-url";
    public static final String EXTERNAL_ID_TYPE_UNKNOWN = "unknown";

    // publication types class ids
    public static final String SEMANTIC_CLASS_INSTANCE_TYPE_ARTICLE = "0001";
    public static final String SEMANTIC_CLASS_INSTANCE_TYPE_DATASET = "0021";

    /** Not instantiable: this class only holds constants. */
    private InfoSpaceConstants() {
    }
}
|
|
@ -1,74 +0,0 @@
|
|||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
/**
|
||||
* Utility class holding parameter names and method simplifying access to parameters from hadoop context.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public final class WorkflowRuntimeParameters {
|
||||
|
||||
public static final String OOZIE_ACTION_OUTPUT_FILENAME = "oozie.action.output.properties";
|
||||
|
||||
public static final char DEFAULT_CSV_DELIMITER = ',';
|
||||
|
||||
public static final String UNDEFINED_NONEMPTY_VALUE = "$UNDEFINED$";
|
||||
|
||||
// default values
|
||||
public static final String DNET_SERVICE_READ_TIMEOUT_DEFAULT_VALUE = "60000";
|
||||
public static final String DNET_SERVICE_CONNECTION_TIMEOUT_DEFAULT_VALUE = "60000";
|
||||
// parameter names
|
||||
public static final String DNET_SERVICE_CLIENT_READ_TIMEOUT = "dnet.service.client.read.timeout";
|
||||
public static final String DNET_SERVICE_CLIENT_CONNECTION_TIMEOUT = "dnet.service.client.connection.timeout";
|
||||
|
||||
// ----------------- CONSTRUCTORS -----------------------------
|
||||
|
||||
private WorkflowRuntimeParameters() {}
|
||||
|
||||
/**
|
||||
* Retrieves parameter from hadoop context configuration when set to value different than {@link WorkflowRuntimeParameters#UNDEFINED_NONEMPTY_VALUE}.
|
||||
*/
|
||||
public static String getParamValue(String paramName, Configuration configuration) {
|
||||
String paramValue = configuration.get(paramName);
|
||||
if (StringUtils.isNotBlank(paramValue) && !UNDEFINED_NONEMPTY_VALUE.equals(paramValue)) {
|
||||
return paramValue;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves {@link Integer} parameter from hadoop context configuration when set to non-empty value different than {@link WorkflowRuntimeParameters#UNDEFINED_NONEMPTY_VALUE}.
|
||||
* Null is returned when parameter was not set.
|
||||
* @throws {@link NumberFormatException} if parameter value does not contain a parsable integer
|
||||
*/
|
||||
public static Integer getIntegerParamValue(String paramName, Configuration configuration) throws NumberFormatException {
|
||||
String paramValue = getParamValue(paramName, configuration);
|
||||
return paramValue!=null?Integer.valueOf(paramValue):null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves parameter from hadoop context configuration when set to value different than {@link WorkflowRuntimeParameters#UNDEFINED_NONEMPTY_VALUE}.
|
||||
* If requested parameter was not set, fallback parameter is retrieved using the same logic.
|
||||
*/
|
||||
public static String getParamValue(String paramName, String fallbackParamName, Configuration configuration) {
|
||||
String resultCandidate = getParamValue(paramName, configuration);
|
||||
return resultCandidate!=null?resultCandidate:getParamValue(fallbackParamName, configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides parameter value. Returns default value when entry not found among parameters.
|
||||
*
|
||||
* @param paramName parameter name
|
||||
* @param defaultValue parameter default value to be returned when entry not found among parameters
|
||||
* @param parameters map of parameters
|
||||
*/
|
||||
public static String getParamValue(String paramName, String defaultValue, Map<String, String> parameters) {
|
||||
return parameters.containsKey(paramName)?parameters.get(paramName):defaultValue;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,111 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.counter;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
/**
|
||||
* Class that groups several counters which are identified by name (<code>String</code> value).
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public class NamedCounters implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
|
||||
private final Map<String, Long> counters;
|
||||
|
||||
|
||||
//------------------------ CONSTRUCTORS --------------------------
|
||||
|
||||
/**
|
||||
* Creates {@link NamedCounters} with empty initial counters.
|
||||
*/
|
||||
public NamedCounters() {
|
||||
this.counters = Maps.newHashMap();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link NamedCounters} with initial counters.<br/>
|
||||
* Starting value of initial counters is zero.
|
||||
*
|
||||
* @param initialCounterNames - names of initial counters
|
||||
*/
|
||||
public NamedCounters(String[] initialCounterNames) {
|
||||
Preconditions.checkNotNull(initialCounterNames);
|
||||
|
||||
this.counters = Maps.newHashMap();
|
||||
|
||||
for (String initialCounterName : initialCounterNames) {
|
||||
this.counters.put(initialCounterName, 0L);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link NamedCounters} with initial counters.<br/>
|
||||
* Starting value of initial counters is zero.
|
||||
*
|
||||
* @param initialCounterNamesEnumClass - enum class providing names of initial counters
|
||||
*/
|
||||
public <E extends Enum<E>> NamedCounters(Class<E> initialCounterNamesEnumClass) {
|
||||
Preconditions.checkNotNull(initialCounterNamesEnumClass);
|
||||
|
||||
this.counters = Maps.newHashMap();
|
||||
Enum<?>[] enumConstants = initialCounterNamesEnumClass.getEnumConstants();
|
||||
|
||||
for (int i=0; i<enumConstants.length; ++i) {
|
||||
this.counters.put(enumConstants[i].name(), 0L);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Increments value by 1 of a counter with the name specified as parameter.<br/>
|
||||
* Internally uses {@link #increment(String, Long)}
|
||||
*/
|
||||
public void increment(String counterName) {
|
||||
increment(counterName, 1L);
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments value of a counter with the name specified as parameter by the given value.<br/>
|
||||
* If current instance of {@link NamedCounters} does not contain counter
|
||||
* with provided name, then before incrementing counter will be created with starting
|
||||
* value equal to zero.
|
||||
*/
|
||||
public void increment(String counterName, Long incrementValue) {
|
||||
|
||||
long oldValue = counters.getOrDefault(counterName, 0L);
|
||||
counters.put(counterName, oldValue + incrementValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns current value of a counter with the name specified as parameter.
|
||||
*
|
||||
* @throws IllegalArgumentException when {@link NamedCounters} does not contain counter
|
||||
* with provided name
|
||||
*/
|
||||
public long currentValue(String counterName) {
|
||||
|
||||
if (!counters.containsKey(counterName)) {
|
||||
throw new IllegalArgumentException("Couldn't find counter with name: " + counterName);
|
||||
}
|
||||
|
||||
return counters.get(counterName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns names of currently tracked counters.
|
||||
*/
|
||||
public Collection<String> counterNames() {
|
||||
return counters.keySet();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.counter;
|
||||
|
||||
import org.apache.spark.AccumulableParam;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Spark {@link AccumulableParam} for tracking multiple counter values using {@link NamedCounters}.
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public class NamedCountersAccumulableParam implements AccumulableParam<NamedCounters, Tuple2<String,Long>> {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Increments {@link NamedCounters} counter with the name same as the first element of passed incrementValue tuple
|
||||
* by value defined in the second element of incrementValue tuple.
|
||||
*/
|
||||
@Override
|
||||
public NamedCounters addAccumulator(NamedCounters counters, Tuple2<String, Long> incrementValue) {
|
||||
counters.increment(incrementValue._1, incrementValue._2);
|
||||
return counters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges two passed {@link NamedCounters}.
|
||||
*/
|
||||
@Override
|
||||
public NamedCounters addInPlace(NamedCounters counters1, NamedCounters counters2) {
|
||||
for (String counterName2 : counters2.counterNames()) {
|
||||
counters1.increment(counterName2, counters2.currentValue(counterName2));
|
||||
}
|
||||
return counters1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns passed initialCounters value without any modifications.
|
||||
*/
|
||||
@Override
|
||||
public NamedCounters zero(NamedCounters initialCounters) {
|
||||
return initialCounters;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,52 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.counter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Writer of {@link NamedCounters} object into a properties file.
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public class NamedCountersFileWriter {
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Writes {@link NamedCounters} as a properties file located under
|
||||
* provided filePath.
|
||||
*
|
||||
* @throws IOException if writing to properties file resulted in an error
|
||||
*/
|
||||
public void writeCounters(NamedCounters counters, String filePath) throws IOException {
|
||||
|
||||
Properties counterProperties = buildPropertiesFromCounters(counters);
|
||||
|
||||
File file = new File(filePath);
|
||||
try (OutputStream os = new FileOutputStream(file)) {
|
||||
|
||||
counterProperties.store(os, null);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------ PRIVATE --------------------------
|
||||
|
||||
private Properties buildPropertiesFromCounters(NamedCounters counters) {
|
||||
|
||||
Properties properties = new Properties();
|
||||
|
||||
for (String counterName : counters.counterNames()) {
|
||||
long count = counters.currentValue(counterName);
|
||||
properties.put(counterName, String.valueOf(count));
|
||||
}
|
||||
|
||||
return properties;
|
||||
}
|
||||
}
|
|
@ -1,67 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.fault;
|
||||
|
||||
import java.io.PrintWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.dhp.audit.schemas.Cause;
|
||||
import eu.dnetlib.dhp.audit.schemas.Fault;
|
||||
|
||||
/**
|
||||
* {@link Fault} related utilities.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public final class FaultUtils {
|
||||
|
||||
// ---------------------- CONSTRUCTORS -------------------
|
||||
|
||||
private FaultUtils() {}
|
||||
|
||||
// ---------------------- LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Generates {@link Fault} instance based on {@link Throwable}.
|
||||
* @param entityId entity identifier
|
||||
* @param throwable
|
||||
* @param auditSupplementaryData
|
||||
* @return {@link Fault} instance generated for {@link Throwable}
|
||||
*/
|
||||
public static Fault exceptionToFault(CharSequence entityId, Throwable throwable,
|
||||
Map<CharSequence, CharSequence> auditSupplementaryData) {
|
||||
Fault.Builder faultBuilder = Fault.newBuilder();
|
||||
faultBuilder.setInputObjectId(entityId);
|
||||
faultBuilder.setTimestamp(System.currentTimeMillis());
|
||||
faultBuilder.setCode(throwable.getClass().getName());
|
||||
faultBuilder.setMessage(throwable.getMessage());
|
||||
StringWriter strWriter = new StringWriter();
|
||||
PrintWriter pw = new PrintWriter(strWriter);
|
||||
throwable.printStackTrace(pw);
|
||||
pw.close();
|
||||
faultBuilder.setStackTrace(strWriter.toString());
|
||||
if (throwable.getCause()!=null) {
|
||||
faultBuilder.setCauses(appendThrowableToCauses(
|
||||
throwable.getCause(), new ArrayList<Cause>()));
|
||||
}
|
||||
if (auditSupplementaryData!=null && !auditSupplementaryData.isEmpty()) {
|
||||
faultBuilder.setSupplementaryData(auditSupplementaryData);
|
||||
}
|
||||
return faultBuilder.build();
|
||||
}
|
||||
|
||||
protected static List<Cause> appendThrowableToCauses(Throwable e, List<Cause> causes) {
|
||||
Cause.Builder causeBuilder = Cause.newBuilder();
|
||||
causeBuilder.setCode(e.getClass().getName());
|
||||
causeBuilder.setMessage(e.getMessage());
|
||||
causes.add(causeBuilder.build());
|
||||
if (e.getCause()!=null) {
|
||||
return appendThrowableToCauses(
|
||||
e.getCause(),causes);
|
||||
} else {
|
||||
return causes;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,98 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
import org.apache.commons.cli.CommandLineParser;
|
||||
import org.apache.commons.cli.GnuParser;
|
||||
import org.apache.commons.cli.HelpFormatter;
|
||||
import org.apache.commons.cli.Option;
|
||||
import org.apache.commons.cli.OptionBuilder;
|
||||
import org.apache.commons.cli.Options;
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public final class CmdLineParser {
|
||||
/** HACK: make the names of various types of parameters of the program
|
||||
* more readable, e.g. "--Input_person=..." instead of "-Iperson=...",
|
||||
* "--Output_merged=..." instead of "-Omerged=...". I wasn't able to
|
||||
* get such notation so far using the Apache CLI. */
|
||||
public static final String constructorPrefix = "C";
|
||||
public static final String inputPrefix = "I";
|
||||
public static final String outputPrefix = "O";
|
||||
public static final String specialParametersPrefix = "S";
|
||||
/** HACK: This field should be removed since this list of special
|
||||
* parameters is empty, thus not used anywhere.*/
|
||||
public static final String[] mandatorySpecialParameters = new String[]{};
|
||||
public static final String processParametersPrefix = "P";
|
||||
|
||||
// ------------------------- CONSTRUCTORS ------------------------------
|
||||
|
||||
private CmdLineParser() {}
|
||||
|
||||
// ------------------------- LOGIC -------------------------------------
|
||||
|
||||
public static CommandLine parse(String[] args) {
|
||||
Options options = new Options();
|
||||
@SuppressWarnings("static-access")
|
||||
Option constructorParams = OptionBuilder.withArgName("STRING")
|
||||
.hasArg()
|
||||
.withDescription("Constructor parameter")
|
||||
.withLongOpt("ConstructorParam")
|
||||
.create(constructorPrefix);
|
||||
options.addOption(constructorParams);
|
||||
@SuppressWarnings("static-access")
|
||||
Option inputs = OptionBuilder.withArgName("portName=URI")
|
||||
.hasArgs(2)
|
||||
.withValueSeparator()
|
||||
.withDescription("Path binding for a given input port")
|
||||
.withLongOpt("Input")
|
||||
.create(inputPrefix);
|
||||
options.addOption(inputs);
|
||||
@SuppressWarnings("static-access")
|
||||
Option outputs = OptionBuilder.withArgName("portName=URI")
|
||||
.hasArgs(2)
|
||||
.withValueSeparator()
|
||||
.withDescription("Path binding for a given output port")
|
||||
.create(outputPrefix);
|
||||
options.addOption(outputs);
|
||||
@SuppressWarnings("static-access")
|
||||
Option specialParameter = OptionBuilder.withArgName("parameter_name=string")
|
||||
.hasArgs(2)
|
||||
.withValueSeparator()
|
||||
.withDescription(String.format("Value of special parameter. "
|
||||
+ "These are the mandatory parameters={%s}",
|
||||
StringUtils.join(mandatorySpecialParameters, ",")))
|
||||
.create(specialParametersPrefix);
|
||||
options.addOption(specialParameter);
|
||||
@SuppressWarnings("static-access")
|
||||
Option otherParameter = OptionBuilder.withArgName("parameter_name=string")
|
||||
.hasArgs(2)
|
||||
.withValueSeparator()
|
||||
.withDescription(
|
||||
String.format("Value of some other parameter."))
|
||||
.create(processParametersPrefix);
|
||||
options.addOption(otherParameter);
|
||||
|
||||
Option help = new Option("help", "print this message");
|
||||
options.addOption(help);
|
||||
|
||||
CommandLineParser parser = new GnuParser();
|
||||
try {
|
||||
CommandLine cmdLine = parser.parse(options, args);
|
||||
if(cmdLine.hasOption("help")){
|
||||
HelpFormatter formatter = new HelpFormatter();
|
||||
formatter.printHelp("", options );
|
||||
System.exit(1);
|
||||
}
|
||||
return cmdLine;
|
||||
} catch (ParseException e) {
|
||||
throw new CmdLineParserException("Parsing command line arguments failed", e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -1,21 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
/**
 * Unchecked exception signalling a problem with the command line arguments.
 *
 * @author Mateusz Kobos
 */
public class CmdLineParserException extends RuntimeException {

    private static final long serialVersionUID = 9219928547611876284L;

    /**
     * @param message description of the problem
     * @param cause underlying failure, kept as this exception's cause
     */
    public CmdLineParserException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the problem
     */
    public CmdLineParserException(String message) {
        super(message);
    }
}
|
|
@ -1,45 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
|
||||
/**
|
||||
* Handles parsing the command line arguments provided by the Oozie
|
||||
* to create a {@link Process}
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class CmdLineParserForProcessConstruction {
|
||||
public Process run(CommandLine cmdLine){
|
||||
String[] args = cmdLine.getArgs();
|
||||
if(args.length != 1){
|
||||
throw new CmdLineParserException("The name of the class has "+
|
||||
"to be specified as the first agrument");
|
||||
}
|
||||
String className = args[0];
|
||||
|
||||
String[] constructorParams = cmdLine.getOptionValues(
|
||||
CmdLineParser.constructorPrefix);
|
||||
if(constructorParams == null){
|
||||
constructorParams = new String[0];
|
||||
}
|
||||
try {
|
||||
Class<?> processClass = Class.forName(className);
|
||||
Constructor<?> processConstructor = null;
|
||||
if(constructorParams.length == 0){
|
||||
try{
|
||||
processConstructor = processClass.getConstructor();
|
||||
return (Process) processConstructor.newInstance();
|
||||
} catch(NoSuchMethodException ex){
|
||||
}
|
||||
}
|
||||
processConstructor = processClass.getConstructor(String[].class);
|
||||
return (Process) processConstructor.newInstance(
|
||||
(Object)constructorParams);
|
||||
} catch (Exception e) {
|
||||
throw new CmdLineParserException(String.format(
|
||||
"Problem while creating class \"%s\"", className), e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,100 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
/**
|
||||
* Handles parsing parameters passed to the {@link Process}
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class CmdLineParserForProcessRunParameters {
|
||||
/** Parse the command line arguments.
|
||||
*
|
||||
* @param cmdLine command line arguments
|
||||
* @param ports names of ports that ought to be extracted from command line
|
||||
*/
|
||||
public ProcessParameters run(CommandLine cmdLine, Ports ports) {
|
||||
|
||||
Properties inputProperties = cmdLine.getOptionProperties(
|
||||
CmdLineParser.inputPrefix);
|
||||
assumePortNamesMatch(CmdLineParser.inputPrefix, inputProperties,
|
||||
ports.getInput().keySet());
|
||||
Map<String, Path> inputBindings = getBindings(
|
||||
inputProperties, ports.getInput().keySet());
|
||||
|
||||
Properties outputProperties = cmdLine.getOptionProperties(
|
||||
CmdLineParser.outputPrefix);
|
||||
assumePortNamesMatch(CmdLineParser.outputPrefix, outputProperties,
|
||||
ports.getOutput().keySet());
|
||||
Map<String, Path> outputBindings = getBindings(
|
||||
outputProperties, ports.getOutput().keySet());
|
||||
|
||||
PortBindings bindings = new PortBindings(inputBindings, outputBindings);
|
||||
|
||||
Properties specialProperties = cmdLine.getOptionProperties(
|
||||
CmdLineParser.specialParametersPrefix);
|
||||
assumeContainAllMandatoryParameters(
|
||||
specialProperties, CmdLineParser.mandatorySpecialParameters);
|
||||
|
||||
Properties rawProperties = cmdLine.getOptionProperties(
|
||||
CmdLineParser.processParametersPrefix);
|
||||
Map<String, String> processParameters = new HashMap<String, String>();
|
||||
for(Entry<Object, Object> entry: rawProperties.entrySet()){
|
||||
processParameters.put(
|
||||
(String)entry.getKey(), (String)entry.getValue());
|
||||
}
|
||||
|
||||
return new ProcessParameters(bindings, processParameters);
|
||||
}
|
||||
|
||||
private static void assumeContainAllMandatoryParameters(
|
||||
Properties properties, String[] mandatoryParameters){
|
||||
for(String otherParameter: mandatoryParameters){
|
||||
if(!properties.containsKey(otherParameter)){
|
||||
throw new CmdLineParserException(String.format(
|
||||
"Not all mandatory properties are set using the \"%s\" "
|
||||
+ "option are given, e.g. \"-%s\" parameter is missing",
|
||||
CmdLineParser.specialParametersPrefix, otherParameter));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void assumePortNamesMatch(String cmdLineParamPrefix,
|
||||
Properties cmdLineProperties, Set<String> portNames) {
|
||||
for (String name : portNames) {
|
||||
if (!cmdLineProperties.containsKey(name)) {
|
||||
throw new CmdLineParserException(String.format(
|
||||
"The port with name \"%s\" is not specified in "
|
||||
+ "command line (command line option \"-%s\" is missing)",
|
||||
name, cmdLineParamPrefix + name));
|
||||
}
|
||||
}
|
||||
for (Object cmdLineKeyObject : cmdLineProperties.keySet()) {
|
||||
String name = (String) cmdLineKeyObject;
|
||||
if (!portNames.contains(name)) {
|
||||
throw new CmdLineParserException(String.format(
|
||||
"A port name \"%s\" which is not specified is given "
|
||||
+ "in the command line "
|
||||
+ "(command line option \"%s\" is excess)",
|
||||
name, cmdLineParamPrefix + name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, Path> getBindings(
|
||||
Properties cmdLineProperties, Set<String> portNames) {
|
||||
Map<String, Path> bindings = new HashMap<String, Path>();
|
||||
for (String name : portNames) {
|
||||
Path path = new Path((String) cmdLineProperties.get(name));
|
||||
bindings.put(name, path);
|
||||
}
|
||||
return bindings;
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
/**
|
||||
* Port names (see {@link Ports}) bound to certain paths in the file system
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class PortBindings {
|
||||
private final Map<String, Path> input;
|
||||
private final Map<String, Path> output;
|
||||
|
||||
public PortBindings(Map<String, Path> input, Map<String, Path> output) {
|
||||
this.input = input;
|
||||
this.output = output;
|
||||
}
|
||||
|
||||
public Map<String, Path> getInput() {
|
||||
return input;
|
||||
}
|
||||
|
||||
public Map<String, Path> getOutput() {
|
||||
return output;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o){
|
||||
if(!(o instanceof PortBindings)){
|
||||
return false;
|
||||
}
|
||||
PortBindings other = (PortBindings) o;
|
||||
return input.equals(other.input) && output.equals(other.output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode(){
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
|
||||
/**
|
||||
* A class that groups information about input and output ports, i.e.
|
||||
* their (name of the port -> type of the port) mappings.
|
||||
* @author Mateusz Kobos
|
||||
*/
|
||||
public class Ports {
|
||||
private final Map<String, PortType> input;
|
||||
private final Map<String, PortType> output;
|
||||
|
||||
public Ports(Map<String, PortType> input, Map<String, PortType> output){
|
||||
this.input = input;
|
||||
this.output = output;
|
||||
}
|
||||
|
||||
public Map<String, PortType> getInput() {
|
||||
return input;
|
||||
}
|
||||
public Map<String, PortType> getOutput() {
|
||||
return output;
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
|
||||
/** Workflow node written in Java.
|
||||
*
|
||||
* The implementing class has to define a constructor with no parameters
|
||||
* (possibly the default one) or a constructor with String[] as a single
|
||||
* parameter.
|
||||
* @author Mateusz Kobos
|
||||
*/
|
||||
public interface Process {
|
||||
/**
|
||||
* Run the process.
|
||||
*
|
||||
* The process ends with a success status if no exception is thrown,
|
||||
* otherwise it ends with an error status.
|
||||
*
|
||||
* @param parameters parameters of the process. Each parameter
|
||||
* corresponds to a single entry in the map, its name is the key, its
|
||||
* value is the value.
|
||||
* @throws Exception if thrown, it means that the process finished
|
||||
* with an error status
|
||||
*/
|
||||
void run(PortBindings portBindings, Configuration conf,
|
||||
Map<String, String> parameters) throws Exception;
|
||||
|
||||
/**
|
||||
* @return map containing as the key: name of the port, as the value: type
|
||||
* of the port
|
||||
*/
|
||||
Map<String, PortType> getInputPorts();
|
||||
|
||||
/**
|
||||
* @return map containing as the key: name of the port, as the value: type
|
||||
* of the port
|
||||
*/
|
||||
Map<String, PortType> getOutputPorts();
|
||||
}
|
|
@ -1,20 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
/**
 * Unchecked exception signalling a failure of a process.
 * @author Dominika Tkaczyk
 *
 */
public class ProcessException extends RuntimeException {

	private static final long serialVersionUID = 2758953138374438377L;

	/**
	 * @param message description of the failure
	 */
	public ProcessException(String message) {
		super(message);
	}

	/**
	 * @param message description of the failure
	 * @param cause underlying reason of the failure
	 */
	public ProcessException(String message, Throwable cause) {
		super(message, cause);
	}
}
|
|
@ -1,43 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
|
||||
/**
|
||||
* Parameters of the Process retrieved from Oozie
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class ProcessParameters {
|
||||
private final PortBindings portBindings;
|
||||
private final Map<String, String> parameters;
|
||||
|
||||
public PortBindings getPortBindings() {
|
||||
return portBindings;
|
||||
}
|
||||
|
||||
public Map<String, String> getParameters(){
|
||||
return parameters;
|
||||
}
|
||||
|
||||
public ProcessParameters(PortBindings portBindings,
|
||||
Map<String, String> parameters) {
|
||||
this.portBindings = portBindings;
|
||||
this.parameters = parameters;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o){
|
||||
if(!(o instanceof ProcessParameters)){
|
||||
return false;
|
||||
}
|
||||
ProcessParameters other = (ProcessParameters) o;
|
||||
return this.portBindings.equals(other.portBindings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode(){
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
/**
|
||||
* {@link Process} related utility class.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public final class ProcessUtils {
|
||||
|
||||
// ------------- CONSTRUCTORS ----------------
|
||||
|
||||
private ProcessUtils() {}
|
||||
|
||||
// ------------- LOGIC -----------------------
|
||||
|
||||
/**
|
||||
* Returns parameter value retrived from parameters or context.
|
||||
* @param paramName
|
||||
* @param hadoopConf
|
||||
* @param parameters
|
||||
* @return parameter value
|
||||
*/
|
||||
public static String getParameterValue(String paramName,
|
||||
Configuration hadoopConf,
|
||||
Map<String, String> parameters) {
|
||||
if (parameters!=null && !parameters.isEmpty()) {
|
||||
String result = null;
|
||||
result = parameters.get(paramName);
|
||||
if (result!=null) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
if (hadoopConf!=null) {
|
||||
return hadoopConf.get(paramName);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,88 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
import org.apache.avro.generic.GenericContainer;
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.io.DataStore;
|
||||
import eu.dnetlib.dhp.common.java.io.FileSystemPath;
|
||||
import eu.dnetlib.dhp.common.java.porttype.AvroPortType;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
|
||||
/**
|
||||
* Creates {@link Process} object through reflection by parsing
|
||||
* the command-line arguments
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class ProcessWrapper {
|
||||
|
||||
public Configuration getConfiguration() throws Exception{
|
||||
return new Configuration();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
ProcessWrapper wrapper = new ProcessWrapper();
|
||||
wrapper.run(args);
|
||||
}
|
||||
|
||||
public void run(String[] args) throws Exception{
|
||||
CommandLine cmdLine = CmdLineParser.parse(args);
|
||||
|
||||
CmdLineParserForProcessConstruction constructionParser =
|
||||
new CmdLineParserForProcessConstruction();
|
||||
Process process = constructionParser.run(cmdLine);
|
||||
Ports ports =
|
||||
new Ports(process.getInputPorts(), process.getOutputPorts());
|
||||
CmdLineParserForProcessRunParameters runParametersParser =
|
||||
new CmdLineParserForProcessRunParameters();
|
||||
ProcessParameters params = runParametersParser.run(cmdLine, ports);
|
||||
Configuration conf = getConfiguration();
|
||||
process.run(params.getPortBindings(), conf, params.getParameters());
|
||||
createOutputsIfDontExist(
|
||||
process.getOutputPorts(), params.getPortBindings().getOutput(),
|
||||
conf);
|
||||
}
|
||||
|
||||
private static void createOutputsIfDontExist(
|
||||
Map<String, PortType> outputPortsSpecification,
|
||||
Map<String, Path> outputPortBindings, Configuration conf) throws IOException{
|
||||
FileSystem fs = FileSystem.get(conf);
|
||||
for(Map.Entry<String, Path> entry: outputPortBindings.entrySet()){
|
||||
Path path = entry.getValue();
|
||||
if(!fs.exists(path) || isEmptyDirectory(fs, path)){
|
||||
PortType rawType = outputPortsSpecification.get(entry.getKey());
|
||||
if(!(rawType instanceof AvroPortType)){
|
||||
throw new RuntimeException("The port \""+entry.getKey()+
|
||||
"\" is not of Avro type and only Avro types are "+
|
||||
"supported");
|
||||
}
|
||||
AvroPortType type = (AvroPortType) rawType;
|
||||
FileSystemPath fsPath = new FileSystemPath(fs, path);
|
||||
DataFileWriter<GenericContainer> writer =
|
||||
DataStore.create(fsPath, type.getSchema());
|
||||
writer.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isEmptyDirectory(FileSystem fs, Path path) throws IOException{
|
||||
if(!fs.isDirectory(path)){
|
||||
return false;
|
||||
}
|
||||
RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, false);
|
||||
/** There's at least one file, so the directory is not empty */
|
||||
if(files.hasNext()){
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -1,156 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.file.DataFileReader;
|
||||
import org.apache.avro.specific.SpecificDatumReader;
|
||||
import org.apache.hadoop.fs.AvroFSInput;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* An abstraction over data store format which allows
|
||||
* iterating over records stored in the data store.
|
||||
* It handles the standard case of a data store that is a directory containing
|
||||
* many Avro files (but it can also read records from a single file).
|
||||
*
|
||||
* @author mhorst
|
||||
* @author Mateusz Kobos
|
||||
*/
|
||||
class AvroDataStoreReader<T> implements CloseableIterator<T> {
|
||||
|
||||
private DataFileReader<T> currentReader;
|
||||
private RemoteIterator<LocatedFileStatus> fileIterator;
|
||||
private final FileSystemPath path;
|
||||
private final Schema readerSchema;
|
||||
|
||||
/**
|
||||
* Ignore file starting with underscore. Such files are also ignored by
|
||||
* default by map-reduce jobs.
|
||||
*/
|
||||
private final Pattern whitelistPattern = Pattern.compile("^(?!_).*");
|
||||
|
||||
/**
|
||||
* Here the schema used for reading the data store is set to be the same
|
||||
* as the one that was used to write it.
|
||||
*/
|
||||
public AvroDataStoreReader(final FileSystemPath path)
|
||||
throws IOException {
|
||||
this(path, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param path path to the data store to be read
|
||||
* @param readerSchema the schema onto which the read data store will
|
||||
* be projected
|
||||
*/
|
||||
public AvroDataStoreReader(final FileSystemPath path, Schema readerSchema)
|
||||
throws IOException {
|
||||
this.path = path;
|
||||
this.readerSchema = readerSchema;
|
||||
fileIterator = path.getFileSystem().listFiles(path.getPath(), false);
|
||||
currentReader = getNextNonemptyReader();
|
||||
}
|
||||
|
||||
private DataFileReader<T> getNextNonemptyReader() throws IOException {
|
||||
while (fileIterator != null && fileIterator.hasNext()) {
|
||||
LocatedFileStatus currentFileStatus = fileIterator.next();
|
||||
if (isValidFile(currentFileStatus)) {
|
||||
FileSystemPath currPath = new FileSystemPath(
|
||||
path.getFileSystem(), currentFileStatus.getPath());
|
||||
DataFileReader<T> reader =
|
||||
getSingleFileReader(currPath, readerSchema);
|
||||
/** Check if the file contains at least one record */
|
||||
if(reader.hasNext()){
|
||||
return reader;
|
||||
} else {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
/** fallback */
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a reader for the specified Avro file. A utility function.
|
||||
* @param path path to the existing file
|
||||
* @param readerSchema optional reader schema. If you want to use the
|
||||
* default option of using writer schema as the reader schema, pass the
|
||||
* {@code null} value.
|
||||
* @throws IOException
|
||||
*/
|
||||
private static <T> DataFileReader<T> getSingleFileReader(
|
||||
FileSystemPath path, Schema readerSchema) throws IOException{
|
||||
try{
|
||||
SpecificDatumReader<T> datumReader = new SpecificDatumReader<T>();
|
||||
if(readerSchema != null){
|
||||
datumReader.setExpected(readerSchema);
|
||||
}
|
||||
long len = path.getFileSystem().getFileStatus(path.getPath()).getLen();
|
||||
FSDataInputStream inputStream = path.getFileSystem().open(path.getPath());
|
||||
return new DataFileReader<T>(
|
||||
new AvroFSInput(inputStream, len), datumReader);
|
||||
} catch (IOException ex){
|
||||
throw new IOException("Problem with file \""+
|
||||
path.getPath().toString()+"\": "+ex.getMessage(), ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether file is valid
|
||||
*
|
||||
* @param fileStatus
|
||||
* @return true when valid, false otherwise
|
||||
*/
|
||||
private boolean isValidFile(LocatedFileStatus fileStatus) {
|
||||
if (fileStatus.isFile()) {
|
||||
return whitelistPattern.matcher(
|
||||
fileStatus.getPath().getName()).matches();
|
||||
}
|
||||
/** fallback */
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return currentReader != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next(){
|
||||
if(currentReader == null){
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
T obj = currentReader.next();
|
||||
if(!currentReader.hasNext()){
|
||||
try{
|
||||
currentReader.close();
|
||||
currentReader = getNextNonemptyReader();
|
||||
} catch(IOException ex){
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if(currentReader != null){
|
||||
currentReader.close();
|
||||
currentReader = null;
|
||||
}
|
||||
fileIterator = null;
|
||||
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * Iterator over an I/O backed source that holds resources and therefore
 * can be {@code close}d explicitly to release them.
 *
 * Calling {@code close} is meant for the case of abandoning the iteration
 * in the middle: the iterator cannot tell whether the iteration is going to
 * be continued, so it keeps holding the resources until told otherwise.
 * When all elements are iterated over there is no need to call
 * {@code close}, since in such situation it is called automatically after
 * the end of iteration.
 *
 * @author mhorst
 *
 * @param <E> type of the iterated elements
 */
public interface CloseableIterator<E> extends Iterator<E>, Closeable {
}
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * Iterator that additionally reports the total number of results.
 * @author mhorst
 *
 * @param <E> type of the iterated elements
 */
public interface CountingIterator<E> extends Iterator<E> {

	/**
	 * Provides total number of results to be iterating on.
	 *
	 * @return total number of results to be iterating on
	 */
	int getCount();
}
|
|
@ -1,172 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
import org.apache.avro.generic.GenericContainer;
|
||||
import org.apache.avro.io.DatumWriter;
|
||||
import org.apache.avro.specific.SpecificDatumWriter;
|
||||
|
||||
|
||||
/**
|
||||
* Utility for accessing to Avro-based data stores stored in file system
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public final class DataStore {
|
||||
|
||||
private final static String singleDataStoreFileName = "content.avro";
|
||||
|
||||
private static final int FILE_NO_PADDING_LENGTH = 7;
|
||||
|
||||
private DataStore(){}
|
||||
|
||||
/**
|
||||
* Create a new data store directory with single file and return writer that allows
|
||||
* adding new records
|
||||
* @param path path to a directory to be created
|
||||
* @param schema schema of the records to be stored in the file
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public static <T> DataFileWriter<T> create(
|
||||
FileSystemPath path, Schema schema) throws IOException{
|
||||
return create(path, schema, singleDataStoreFileName);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a new data store directory and return writer that allows
|
||||
* adding new records
|
||||
* @param path path to a directory to be created
|
||||
* @param schema schema of the records to be stored in the file
|
||||
* @param dataStoreFileName datastore file name
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public static <T> DataFileWriter<T> create(
|
||||
FileSystemPath path, Schema schema, String dataStoreFileName) throws IOException{
|
||||
path.getFileSystem().mkdirs(path.getPath());
|
||||
FileSystemPath outFile = new FileSystemPath(
|
||||
path, dataStoreFileName);
|
||||
return DataStore.createSingleFile(outFile, schema);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get reader for reading records from given data store
|
||||
*
|
||||
* Here the schema used for reading the data store is set to be the same
|
||||
* as the one that was used to write it.
|
||||
*
|
||||
* @see getReader(FileSystemPath path, Schema readerSchema) for details.
|
||||
*
|
||||
*/
|
||||
public static <T> CloseableIterator<T> getReader(FileSystemPath path)
|
||||
throws IOException{
|
||||
return getReader(path, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get reader for reading records from given data store
|
||||
* @param path path to a directory corresponding to data store
|
||||
* @param readerSchema the schema onto which the read data store will
|
||||
* be projected
|
||||
*/
|
||||
public static <T> CloseableIterator<T> getReader(
|
||||
FileSystemPath path, Schema readerSchema) throws IOException{
|
||||
return new AvroDataStoreReader<T>(path, readerSchema);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read data store entries and insert them into a list. A utility function.
|
||||
*
|
||||
* Here the schema used for reading the data store is set to be the same
|
||||
* as the one that was used to write it.
|
||||
*/
|
||||
public static <T> List<T> read(FileSystemPath path)
|
||||
throws IOException{
|
||||
return read(path, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read data store entries and insert them into a list. A utility function.
|
||||
*
|
||||
* @param readerSchema the schema onto which the read data store will
|
||||
* be projected
|
||||
*/
|
||||
public static <T> List<T> read(FileSystemPath path, Schema readerSchema)
|
||||
throws IOException{
|
||||
CloseableIterator<T> iterator = getReader(path, readerSchema);
|
||||
List<T> elems = new ArrayList<T>();
|
||||
while(iterator.hasNext()){
|
||||
elems.add(iterator.next());
|
||||
}
|
||||
return elems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a data store from a list of entries. A utility function.
|
||||
* The schema is implicitly
|
||||
* taken from the first element from the {@code elements} list.
|
||||
* @param elements list of elements to write. At least one element has
|
||||
* to be present, because it is used to retrieve schema of the
|
||||
* structures passed in the list.
|
||||
*/
|
||||
public static <T extends GenericContainer> void create(
|
||||
List<T> elements, FileSystemPath path) throws IOException{
|
||||
if(elements.isEmpty()){
|
||||
throw new IllegalArgumentException(
|
||||
"The list of elements has to be non-empty");
|
||||
}
|
||||
Schema schema = elements.get(0).getSchema();
|
||||
create(elements, path, schema);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a data store from a list of entries with schema given explicitly.
|
||||
* A utility function.
|
||||
*/
|
||||
public static <T extends GenericContainer> void create(
|
||||
List<T> elements, FileSystemPath path, Schema schema)
|
||||
throws IOException{
|
||||
DataFileWriter<T> writer = create(path, schema);
|
||||
try{
|
||||
for(T i: elements){
|
||||
writer.append(i);
|
||||
}
|
||||
} finally {
|
||||
if(writer != null){
|
||||
writer.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a single Avro file. This method shouldn't be normally used to
|
||||
* create data stores since it creates only a single Avro file,
|
||||
* while a data store consists of a directory containing one or more files.
|
||||
*/
|
||||
public static <T> DataFileWriter<T> createSingleFile(
|
||||
FileSystemPath path, Schema schema) throws IOException{
|
||||
DatumWriter<T> datumWriter = new SpecificDatumWriter<T>();
|
||||
DataFileWriter<T> writer = new DataFileWriter<T>(datumWriter);
|
||||
writer.create(schema, path.getFileSystem().create(path.getPath()));
|
||||
return writer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates filename for given file number.
|
||||
* @param fileNo file sequence number
|
||||
*/
|
||||
public static String generateFileName(int fileNo) {
|
||||
StringBuffer strBuff = new StringBuffer(String.valueOf(fileNo));
|
||||
while(strBuff.length()<FILE_NO_PADDING_LENGTH) {
|
||||
strBuff.insert(0, '0');
|
||||
}
|
||||
strBuff.append(".avro");
|
||||
return strBuff.toString();
|
||||
}
|
||||
}
|
|
@ -1,61 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
/**
|
||||
* Path to a directory or a file along with information about the
|
||||
* file system in which the path is defined.
|
||||
*
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class FileSystemPath {
|
||||
private final FileSystem fs;
|
||||
private final Path path;
|
||||
|
||||
/**
|
||||
* Path in the local file system
|
||||
*/
|
||||
public FileSystemPath(File file) throws IOException {
|
||||
this(new Path(file.toURI()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Path in the local file system
|
||||
*/
|
||||
public FileSystemPath(Path path) throws IOException{
|
||||
this(FileSystem.get(new Configuration(false)), path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Path in the given file system
|
||||
*/
|
||||
public FileSystemPath(FileSystem fs, Path path){
|
||||
this.fs = fs;
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
/** Create a path with a child element */
|
||||
public FileSystemPath(FileSystemPath parent, String child){
|
||||
this.fs = parent.getFileSystem();
|
||||
this.path = new Path(parent.getPath(), child);
|
||||
}
|
||||
|
||||
public FileSystem getFileSystem() {
|
||||
return fs;
|
||||
}
|
||||
|
||||
public Path getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
public FSDataInputStream getInputStream() throws IOException{
|
||||
return getFileSystem().open(getPath());
|
||||
}
|
||||
}
|
|
@ -1,37 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
/**
|
||||
* Util class containing operations on hdfs or local filesystem
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public final class HdfsUtils {
|
||||
|
||||
//------------------------ CONSTRUCTORS -------------------
|
||||
|
||||
private HdfsUtils() {}
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Removes file or directory (recursively) located under the specified pathname.
|
||||
*/
|
||||
public static void remove(Configuration hadoopConf, String pathname) throws IOException {
|
||||
|
||||
Path path = new Path(pathname);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(hadoopConf);
|
||||
|
||||
if (fileSystem.exists(path)) {
|
||||
fileSystem.delete(path, true);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,159 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.io;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.SequenceFile.Reader;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.Writable;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
|
||||
/**
|
||||
* Iterator that extracts sequence file's consecutive {@link Text} values.
|
||||
*
|
||||
* @author mhorst
|
||||
*/
|
||||
public class SequenceFileTextValueReader implements CloseableIterator<Text> {
|
||||
|
||||
private SequenceFile.Reader sequenceReader;
|
||||
|
||||
private final RemoteIterator<LocatedFileStatus> fileIt;
|
||||
|
||||
private final FileSystem fs;
|
||||
|
||||
/**
|
||||
* Ignore file starting with underscore. Such files are also ignored by
|
||||
* default by map-reduce jobs.
|
||||
*/
|
||||
private final static Pattern WHITELIST_REGEXP = Pattern.compile("^[^_].*");
|
||||
|
||||
private Text toBeReturned;
|
||||
|
||||
//------------------------ CONSTRUCTORS --------------------------
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
*
|
||||
* @param path HDFS path along with associated FileSystem
|
||||
* @throws IOException
|
||||
*/
|
||||
public SequenceFileTextValueReader(final FileSystemPath path) throws IOException {
|
||||
this.fs = path.getFileSystem();
|
||||
if (fs.isDirectory(path.getPath())) {
|
||||
fileIt = fs.listFiles(path.getPath(), false);
|
||||
sequenceReader = getNextSequenceReader();
|
||||
} else {
|
||||
fileIt = null;
|
||||
sequenceReader = new Reader(fs.getConf(), SequenceFile.Reader.file(path.getPath()));
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------ LOGIC ---------------------------------
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#hasNext()
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
// check and provide next when already returned
|
||||
if (toBeReturned == null) {
|
||||
toBeReturned = getNext();
|
||||
}
|
||||
return toBeReturned != null;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#next()
|
||||
*/
|
||||
@Override
|
||||
public Text next() {
|
||||
if (toBeReturned != null) {
|
||||
// element fetched while executing hasNext()
|
||||
Text result = toBeReturned;
|
||||
toBeReturned = null;
|
||||
return result;
|
||||
} else {
|
||||
Text resultCandidate = getNext();
|
||||
if (resultCandidate!=null) {
|
||||
return resultCandidate;
|
||||
} else {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.dhp.exp.iterator.ClosableIterator#close()
|
||||
*/
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (sequenceReader != null) {
|
||||
sequenceReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------ PRIVATE -------------------------------
|
||||
|
||||
private final Reader getNextSequenceReader() throws IOException {
|
||||
while (fileIt != null && fileIt.hasNext()) {
|
||||
LocatedFileStatus currentFileStatus = fileIt.next();
|
||||
if (isValidFile(currentFileStatus)) {
|
||||
return new Reader(this.fs.getConf(), SequenceFile.Reader.file(currentFileStatus.getPath()));
|
||||
}
|
||||
}
|
||||
// fallback
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether file is valid candidate.
|
||||
*
|
||||
* @param fileStatus
|
||||
* file status holding file name
|
||||
* @return true when valid, false otherwise
|
||||
*/
|
||||
private final boolean isValidFile(LocatedFileStatus fileStatus) {
|
||||
if (fileStatus.isFile()) {
|
||||
return WHITELIST_REGEXP.matcher(fileStatus.getPath().getName()).matches();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return next data package
|
||||
*/
|
||||
private Text getNext() {
|
||||
try {
|
||||
if (sequenceReader == null) {
|
||||
return null;
|
||||
}
|
||||
Writable key = (Writable) ReflectionUtils.newInstance(sequenceReader.getKeyClass(), fs.getConf());
|
||||
Writable value = (Writable) ReflectionUtils.newInstance(sequenceReader.getValueClass(), fs.getConf());
|
||||
if (sequenceReader.next(key, value)) {
|
||||
return (Text) value;
|
||||
} else {
|
||||
sequenceReader.close();
|
||||
sequenceReader = getNextSequenceReader();
|
||||
if (sequenceReader != null) {
|
||||
return getNext();
|
||||
}
|
||||
}
|
||||
// fallback
|
||||
return null;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,54 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.jsonworkflownodes;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.PortBindings;
|
||||
import eu.dnetlib.dhp.common.java.Process;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
|
||||
/**
|
||||
* Utility class responsible for copying resources available on classpath to specified HDFS location.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class ClassPathResourceToHdfsCopier implements Process {
|
||||
|
||||
private static final String PARAM_INPUT_CLASSPATH_RESOURCE = "inputClasspathResource";
|
||||
|
||||
private static final String PARAM_OUTPUT_HDFS_FILE_LOCATION = "outputHdfsFileLocation";
|
||||
|
||||
@Override
|
||||
public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception {
|
||||
Preconditions.checkNotNull(parameters.get(PARAM_INPUT_CLASSPATH_RESOURCE), PARAM_INPUT_CLASSPATH_RESOURCE + " parameter was not specified!");
|
||||
Preconditions.checkNotNull(parameters.get(PARAM_OUTPUT_HDFS_FILE_LOCATION), PARAM_OUTPUT_HDFS_FILE_LOCATION + " parameter was not specified!");
|
||||
|
||||
FileSystem fs = FileSystem.get(conf);
|
||||
|
||||
try (InputStream in = Thread.currentThread().getContextClassLoader()
|
||||
.getResourceAsStream(parameters.get(PARAM_INPUT_CLASSPATH_RESOURCE));
|
||||
OutputStream os = fs.create(new Path(parameters.get(PARAM_OUTPUT_HDFS_FILE_LOCATION)))) {
|
||||
IOUtils.copyBytes(in, os, 4096, false);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getInputPorts() {
|
||||
return new HashMap<String, PortType>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getOutputPorts() {
|
||||
return new HashMap<String, PortType>();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,66 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.jsonworkflownodes;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.jsonworkflownodes.StringPortSpecificationExtractor.PortSpecification;
|
||||
import eu.dnetlib.dhp.common.java.porttype.AvroPortType;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
import eu.dnetlib.dhp.common.utils.AvroUtils;
|
||||
|
||||
/**
|
||||
* @author Mateusz Kobos
|
||||
*/
|
||||
public class PortSpecifications {
|
||||
private static final String[] propertyRegexps =
|
||||
new String[]{"[\\w\\.]+", "[\\w\\./_\\-]+"};
|
||||
private final Map<String, SpecificationValues> specs;
|
||||
|
||||
public static class SpecificationValues {
|
||||
|
||||
private final Schema schema;
|
||||
|
||||
private final String jsonFilePath;
|
||||
|
||||
public SpecificationValues(Schema schema, String jsonFilePath) {
|
||||
this.schema = schema;
|
||||
this.jsonFilePath = jsonFilePath;
|
||||
}
|
||||
|
||||
public Schema getSchema() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
public String getJsonFilePath() {
|
||||
return jsonFilePath;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public PortSpecifications(String[] portSpecifications){
|
||||
StringPortSpecificationExtractor portSpecExtractor =
|
||||
new StringPortSpecificationExtractor(propertyRegexps);
|
||||
specs = new HashMap<String, SpecificationValues>();
|
||||
for(int i = 0; i < portSpecifications.length; i++){
|
||||
PortSpecification portSpec = portSpecExtractor.getSpecification(portSpecifications[i]);
|
||||
Schema schema = AvroUtils.toSchema(portSpec.getProperties()[0]);
|
||||
String jsonPath = portSpec.getProperties()[1];
|
||||
specs.put(portSpec.getName(), new SpecificationValues(schema, jsonPath));
|
||||
}
|
||||
}
|
||||
|
||||
public SpecificationValues get(String portName){
|
||||
return specs.get(portName);
|
||||
}
|
||||
|
||||
public Map<String, PortType> getPortTypes(){
|
||||
Map<String, PortType> ports = new HashMap<String, PortType>();
|
||||
for(Map.Entry<String, SpecificationValues> e: specs.entrySet()){
|
||||
Schema schema = e.getValue().schema;
|
||||
ports.put(e.getKey(), new AvroPortType(schema));
|
||||
}
|
||||
return ports;
|
||||
}
|
||||
}
|
|
@ -1,89 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.jsonworkflownodes;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Extracts information about port name and its properties from a string
 * of a form "{port_name, property_1, property_2, ...}".
 * Whitespace is tolerated before and after the braces, after the opening brace,
 * after each comma and before the closing brace.
 * @author Mateusz Kobos
 */
public class StringPortSpecificationExtractor {

    /** Pattern of the port name, i.e. the first element within the braces. */
    private static final String PORT_NAME_REGEXP = "[\\w\\._]+";

    private final String[] propertiesRegexp;
    private final String portSpecificationRegexp;
    private final Pattern pattern;

    /**
     * Port name along with the values of its properties, in declaration order.
     */
    public static class PortSpecification {

        private final String name;

        private final String[] properties;

        public PortSpecification(String name, String[] properties) {
            this.name = name;
            this.properties = properties;
        }

        public String getName() {
            return name;
        }

        public String[] getProperties() {
            return properties;
        }
    }

    /**
     * @param propertiesRegexp regular expressions specifying pattern for
     * each of the properties associated with a port. An example of a single
     * specification: {@code "[\\w\\.]+"}. The expressions must not introduce
     * capturing groups of their own, otherwise {@link #getSpecification(String)}
     * rejects every input.
     */
    public StringPortSpecificationExtractor(String[] propertiesRegexp){
        this.propertiesRegexp = propertiesRegexp;
        this.portSpecificationRegexp = createRegexpString(PORT_NAME_REGEXP, propertiesRegexp);
        this.pattern = Pattern.compile(this.portSpecificationRegexp);
    }

    /**
     * Builds the regular expression matching the whole specification, with one
     * capturing group for the port name and one for each property.
     */
    private static String createRegexpString(String portNameRegexp, String[] propertiesRegexp){
        StringBuilder regexp = new StringBuilder();
        // leading whitespace is optional ("\\s*", not the literal letter 's')
        regexp.append("\\s*\\{\\s*");
        regexp.append("(").append(portNameRegexp).append(")");
        for(String propertyRegexp: propertiesRegexp){
            regexp.append(",\\s*(").append(propertyRegexp).append(")");
        }
        regexp.append("\\s*\\}\\s*");
        return regexp.toString();
    }

    private int getPropertiesCount(){
        return propertiesRegexp.length;
    }

    /**
     * Parses single port specification.
     *
     * @param text specification of a form "{port_name, property_1, property_2, ...}"
     * @return port name and property values found in the text
     * @throws RuntimeException when the text does not match the expected form or
     * the number of captured groups differs from the number of properties plus one
     */
    public PortSpecification getSpecification(String text){
        Matcher m = pattern.matcher(text);
        if(!m.matches()){
            throw new RuntimeException(String.format("Specification of " +
                    "the port (\"%s\") does not match regexp \"%s\"",
                    text, portSpecificationRegexp));
        }
        final int expectedGroupsCount = getPropertiesCount()+1;
        if(m.groupCount() != expectedGroupsCount){
            // capturing groups are numbered from 1, group 0 is the whole match
            StringBuilder groups = new StringBuilder();
            for(int i = 1; i <= m.groupCount(); i++){
                groups.append("\"").append(m.group(i)).append("\"");
                if(i != m.groupCount()) {
                    groups.append(", ");
                }
            }
            throw new RuntimeException(String.format(
                    "Invalid output port specification \"%s\": got %d groups "+
                    "instead of %d (namely: %s)", text, m.groupCount(),
                    expectedGroupsCount, groups.toString()));
        }
        String[] properties = new String[getPropertiesCount()];
        for(int i = 0; i < getPropertiesCount(); i++){
            // group 1 holds the port name, properties start at group 2
            properties[i] = m.group(i+2);
        }
        return new PortSpecification(m.group(1), properties);
    }
}
|
|
@ -1,20 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.porttype;
|
||||
|
||||
/**
|
||||
* A port type that accepts any type of data
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public class AnyPortType implements PortType {
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "Any";
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accepts(PortType other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,65 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.porttype;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
|
||||
/**
|
||||
* This port type accepts data stores in a format of Avro
|
||||
* Object Container Files, i.e. Avro data files.
|
||||
* This kind of file corresponds to a list of objects, each one being of the
|
||||
* same type, i.e. each one is defined by the same Avro schema.
|
||||
* @author Mateusz Kobos
|
||||
*/
|
||||
public class AvroPortType implements PortType {
|
||||
|
||||
private final Schema schema;
|
||||
|
||||
|
||||
public AvroPortType(Schema schema) {
|
||||
this.schema = schema;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return schema.getFullName();
|
||||
}
|
||||
|
||||
@Override
|
||||
/** Simple check if the port types are exactly the same
|
||||
* (as defined by the {@code equals} method).
|
||||
*
|
||||
* TODO: this should work in a more relaxed way -
|
||||
* {@code this.accepts(other)} should be true if {@code this}
|
||||
* describes a subset of structures defined in {@code other}. To be
|
||||
* more precise: the JSON schema tree tree defined by {@code this} should
|
||||
* form a sub-tree of the JSON schema tree defined by {@code other}. */
|
||||
public boolean accepts(PortType other) {
|
||||
return this.equals(other);
|
||||
}
|
||||
|
||||
/**
|
||||
* Two patterns are equal if their schemas are the same.
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object o){
|
||||
if(!(o instanceof AvroPortType)){
|
||||
return false;
|
||||
}
|
||||
AvroPortType other = (AvroPortType) o;
|
||||
return this.schema.equals(other.schema);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode(){
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns avro schema.
|
||||
* @return avro schema
|
||||
*/
|
||||
public Schema getSchema() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,33 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.java.porttype;
|
||||
|
||||
/**
 * Describes the kind of data a workflow node accepts on one of its input
 * ports or produces on one of its output ports.
 *
 * @author Mateusz Kobos
 *
 */
public interface PortType {

    /**
     * @return name of this port type
     */
    String getName();

    /**
     * Compatibility check between the data produced by one workflow node and
     * the data consumed by another one.
     * When node A produces data on a port p and node B consumes this data on
     * a port q, {@code type(q).accepts(type(p))} has to be true.
     * <p>
     * The relation is analogous to assignment in object-oriented programming:
     * for {@code this = other} to work, {@code other} has to conform to
     * {@code this} (in other words: inherit from it), i.e. {@code other} has at
     * least all the properties present in {@code this} and possibly some more.
     *
     * @param other port type of the data being offered
     * @return {@code true} if {@code this} port type is a more general
     * version of the {@code other} port type
     */
    boolean accepts(PortType other);
}
|
|
@ -1,149 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.lock;
|
||||
|
||||
import java.security.InvalidParameterException;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.ha.ZKFailoverController;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.zookeeper.CreateMode;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.Watcher.Event;
|
||||
import org.apache.zookeeper.ZooDefs;
|
||||
import org.apache.zookeeper.ZooKeeper;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.base.Stopwatch;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.PortBindings;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
|
||||
/**
|
||||
* Zookeeper lock managing process. Blocks until lock is released.
|
||||
*
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class LockManagingProcess implements eu.dnetlib.dhp.common.java.Process {
|
||||
|
||||
public static final String DEFAULT_ROOT_NODE = "/cache";
|
||||
|
||||
public static final String NODE_SEPARATOR = "/";
|
||||
|
||||
public static final String PARAM_ZK_SESSION_TIMEOUT = "zk_session_timeout";
|
||||
|
||||
public static final String PARAM_NODE_ID = "node_id";
|
||||
|
||||
public static final String PARAM_LOCK_MODE = "mode";
|
||||
|
||||
public static enum LockMode {
|
||||
obtain,
|
||||
release
|
||||
}
|
||||
|
||||
public static final int DEFAULT_SESSION_TIMEOUT = 60000;
|
||||
|
||||
public static final Logger log = Logger.getLogger(LockManagingProcess.class);
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getInputPorts() {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getOutputPorts() {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(PortBindings portBindings, Configuration conf,
|
||||
Map<String, String> parameters) throws Exception {
|
||||
|
||||
Preconditions.checkArgument(parameters.containsKey(PARAM_NODE_ID), "node id not provided!");
|
||||
Preconditions.checkArgument(parameters.containsKey(PARAM_LOCK_MODE), "lock mode not provided!");
|
||||
|
||||
String zkConnectionString = conf.get(ZKFailoverController.ZK_QUORUM_KEY);
|
||||
Preconditions.checkArgument(StringUtils.isNotBlank(zkConnectionString),
|
||||
"zookeeper quorum is unknown, invalid '%s' property value: %s", ZKFailoverController.ZK_QUORUM_KEY, zkConnectionString);
|
||||
|
||||
int sessionTimeout = parameters.containsKey(PARAM_ZK_SESSION_TIMEOUT)?
|
||||
Integer.valueOf(parameters.get(PARAM_ZK_SESSION_TIMEOUT)) : DEFAULT_SESSION_TIMEOUT;
|
||||
|
||||
final ZooKeeper zooKeeper = new ZooKeeper(zkConnectionString, sessionTimeout, (e) -> {
|
||||
// we are not interested in generic events
|
||||
});
|
||||
|
||||
// initializing root node if does not exist
|
||||
if (zooKeeper.exists(DEFAULT_ROOT_NODE, false) == null) {
|
||||
log.info("initializing root node: " + DEFAULT_ROOT_NODE);
|
||||
zooKeeper.create(DEFAULT_ROOT_NODE, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
|
||||
log.info("root node initialized");
|
||||
}
|
||||
|
||||
final String nodePath = generatePath(parameters.get(PARAM_NODE_ID), DEFAULT_ROOT_NODE);
|
||||
|
||||
final Semaphore semaphore = new Semaphore(1);
|
||||
semaphore.acquire();
|
||||
|
||||
switch(LockMode.valueOf(parameters.get(PARAM_LOCK_MODE))) {
|
||||
case obtain: {
|
||||
obtain(zooKeeper, nodePath, semaphore);
|
||||
break;
|
||||
}
|
||||
case release: {
|
||||
release(zooKeeper, nodePath);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
throw new InvalidParameterException("unsupported lock mode: " + parameters.get(PARAM_LOCK_MODE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------- PRIVATE --------------------------
|
||||
|
||||
private void obtain(final ZooKeeper zooKeeper, final String nodePath, final Semaphore semaphore) throws KeeperException, InterruptedException {
|
||||
log.info("trying to obtain lock: " + nodePath);
|
||||
if (zooKeeper.exists(nodePath, (event) -> {
|
||||
if (Event.EventType.NodeDeleted == event.getType()) {
|
||||
try {
|
||||
log.info(nodePath + " lock release detected");
|
||||
log.info("creating new lock instance: " + nodePath + "...");
|
||||
zooKeeper.create(nodePath, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
|
||||
log.info("lock" + nodePath + " created");
|
||||
semaphore.release();
|
||||
} catch (KeeperException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}) == null) {
|
||||
log.info("lock not found, creating new lock instance: " + nodePath);
|
||||
zooKeeper.create(nodePath, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
|
||||
log.info("lock" + nodePath + " created");
|
||||
semaphore.release();
|
||||
} else {
|
||||
// waiting until node is removed by other lock manager
|
||||
log.info("waiting until lock is released");
|
||||
Stopwatch timer = new Stopwatch().start();
|
||||
semaphore.acquire();
|
||||
log.info("lock released, waited for " + timer.elapsedMillis() + " ms");
|
||||
semaphore.release();
|
||||
}
|
||||
}
|
||||
|
||||
private void release(final ZooKeeper zooKeeper, final String nodePath) throws InterruptedException, KeeperException {
|
||||
log.info("removing lock" + nodePath + "...");
|
||||
zooKeeper.delete(nodePath, -1);
|
||||
log.info("lock" + nodePath + " removed");
|
||||
}
|
||||
|
||||
private static final String generatePath(String nodeId, String rootNode) {
|
||||
return rootNode + NODE_SEPARATOR + nodeId.replace('/', '_');
|
||||
}
|
||||
|
||||
}
|
|
@ -1,24 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.oozie;
|
||||
|
||||
import org.apache.oozie.client.OozieClient;
|
||||
|
||||
/**
|
||||
* Factory of {@link OozieClient}
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public class OozieClientFactory {
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Returns {@link OozieClient} object used for communication with oozie
|
||||
*/
|
||||
public OozieClient createOozieClient(String oozieUrl) {
|
||||
|
||||
OozieClient oozieClient = new OozieClient(oozieUrl);
|
||||
|
||||
return oozieClient;
|
||||
}
|
||||
}
|
|
@ -1,76 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.oozie.property;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.PortBindings;
|
||||
import eu.dnetlib.dhp.common.java.Process;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import static eu.dnetlib.dhp.common.WorkflowRuntimeParameters.OOZIE_ACTION_OUTPUT_FILENAME;
|
||||
|
||||
/**
|
||||
* This process is a solution for setting dynamic properties in oozie workflow definition.
|
||||
*
|
||||
* Expects three parameters to be provided: the first 'condition' parameter is boolean value
|
||||
* based on which either first 'inCaseOfTrue' or second 'elseCase' parameter value is set as
|
||||
* the 'result' property.
|
||||
*
|
||||
* This can be understood as the:
|
||||
*
|
||||
* condition ? inCaseOfTrue : elseCase
|
||||
*
|
||||
* java syntax equivalent.
|
||||
*
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class ConditionalPropertySetter implements Process {
|
||||
|
||||
public static final String PARAM_CONDITION = "condition";
|
||||
public static final String PARAM_INCASEOFTRUE = "inCaseOfTrue";
|
||||
public static final String PARAM_ELSECASE = "elseCase";
|
||||
|
||||
public static final String OUTPUT_PROPERTY_RESULT = "result";
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getInputPorts() {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getOutputPorts() {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(PortBindings portBindings, Configuration conf,
|
||||
Map<String, String> parameters) throws Exception {
|
||||
|
||||
String condition = parameters.get(PARAM_CONDITION);
|
||||
if (condition == null) {
|
||||
throw new RuntimeException("unable to make decision: " +
|
||||
PARAM_CONDITION + " parameter was not set!");
|
||||
}
|
||||
|
||||
Properties props = new Properties();
|
||||
props.setProperty(OUTPUT_PROPERTY_RESULT,
|
||||
Boolean.parseBoolean(condition)?
|
||||
parameters.get(PARAM_INCASEOFTRUE):
|
||||
parameters.get(PARAM_ELSECASE));
|
||||
OutputStream os = new FileOutputStream(
|
||||
new File(System.getProperty(OOZIE_ACTION_OUTPUT_FILENAME)));
|
||||
try {
|
||||
props.store(os, "");
|
||||
} finally {
|
||||
os.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,12 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.protobuf;
|
||||
|
||||
import com.google.protobuf.Message;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
|
||||
/**
|
||||
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
|
||||
*/
|
||||
public interface AvroToProtoBufConverter<IN extends IndexedRecord, OUT extends Message> {
|
||||
String convertIntoKey(IN datum);
|
||||
OUT convertIntoValue(IN datum);
|
||||
}
|
|
@ -1,62 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.protobuf;
|
||||
|
||||
import com.google.protobuf.Message;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.avro.mapred.AvroKey;
|
||||
import org.apache.hadoop.io.BytesWritable;
|
||||
import org.apache.hadoop.io.NullWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
|
||||
*/
|
||||
public class AvroToProtoBufOneToOneMapper<IN extends IndexedRecord, OUT extends Message>
|
||||
extends Mapper<AvroKey<IN>, NullWritable, Text, BytesWritable> {
|
||||
private static final String CONVERTER_CLASS_PROPERTY = "converter_class";
|
||||
private static final Logger log = Logger.getLogger(AvroToProtoBufOneToOneMapper.class);
|
||||
|
||||
private final Text keyWritable = new Text();
|
||||
private final BytesWritable valueWritable = new BytesWritable();
|
||||
private AvroToProtoBufConverter<IN, OUT> converter;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public void setup(Context context) throws IOException, InterruptedException {
|
||||
Class<?> converterClass = context.getConfiguration().getClass(CONVERTER_CLASS_PROPERTY, null);
|
||||
|
||||
if (converterClass == null) {
|
||||
throw new IOException("Please specify " + CONVERTER_CLASS_PROPERTY);
|
||||
}
|
||||
|
||||
try {
|
||||
converter = (AvroToProtoBufConverter<IN, OUT>) converterClass.newInstance();
|
||||
} catch (ClassCastException e) {
|
||||
throw new IOException(
|
||||
"Class specified in " + CONVERTER_CLASS_PROPERTY + " doesn't implement AvroToProtoBufConverter", e);
|
||||
} catch (Exception e) {
|
||||
throw new IOException(
|
||||
"Could not instantiate specified AvroToProtoBufConverter class, " + converterClass, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void map(AvroKey<IN> avro, NullWritable ignore, Context context)
|
||||
throws IOException, InterruptedException {
|
||||
String key = null;
|
||||
try {
|
||||
key = converter.convertIntoKey(avro.datum());
|
||||
keyWritable.set(key);
|
||||
|
||||
byte[] value = converter.convertIntoValue(avro.datum()).toByteArray();
|
||||
valueWritable.set(value, 0, value.length);
|
||||
|
||||
context.write(keyWritable, valueWritable);
|
||||
} catch (Exception e) {
|
||||
log.error("Error" + (key != null ? " while processing " + key : ""), e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,32 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.report;
|
||||
|
||||
import eu.dnetlib.dhp.common.schemas.ReportEntry;
|
||||
import eu.dnetlib.dhp.common.schemas.ReportEntryType;
|
||||
|
||||
/**
|
||||
* Factory of {@link ReportEntry} objects.
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public final class ReportEntryFactory {
|
||||
|
||||
// ----------------------- CONSTRUCTORS -----------------------------
|
||||
|
||||
private ReportEntryFactory() {}
|
||||
|
||||
// ----------------------- LOGIC ------------------------------------
|
||||
|
||||
/**
|
||||
* Creates {@link ReportEntry} with {@link ReportEntryType#COUNTER} type
|
||||
*/
|
||||
public static ReportEntry createCounterReportEntry(String key, long count) {
|
||||
return new ReportEntry(key, ReportEntryType.COUNTER, String.valueOf(count));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link ReportEntry} with {@link ReportEntryType#DURATION} type
|
||||
*/
|
||||
public static ReportEntry createDurationReportEntry(String key, long duration) {
|
||||
return new ReportEntry(key, ReportEntryType.DURATION, String.valueOf(duration));
|
||||
}
|
||||
}
|
|
@ -1,110 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.report;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.PortBindings;
|
||||
import eu.dnetlib.dhp.common.java.Process;
|
||||
import eu.dnetlib.dhp.common.java.io.DataStore;
|
||||
import eu.dnetlib.dhp.common.java.io.FileSystemPath;
|
||||
import eu.dnetlib.dhp.common.java.porttype.AvroPortType;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
import eu.dnetlib.dhp.common.schemas.ReportEntry;
|
||||
|
||||
/**
|
||||
* Java workflow node process for building report.<br/>
|
||||
* It writes report properties into avro datastore of {@link ReportEntry}s
|
||||
* with location specified in output port.<br/>
|
||||
* Report property name must start with <code>report.</code> to
|
||||
* be included in output datastore.
|
||||
*
|
||||
* Usage example:<br/>
|
||||
* <pre>
|
||||
* {@code
|
||||
* <action name="report">
|
||||
* <java>
|
||||
* <main-class>eu.dnetlib.dhp.common.java.ProcessWrapper</main-class>
|
||||
* <arg>eu.dnetlib.dhp.common.report.ReportGenerator</arg>
|
||||
* <arg>-Preport.someProperty=someValue</arg>
|
||||
* <arg>-Oreport=/report/path</arg>
|
||||
* </java>
|
||||
* ...
|
||||
* </action>
|
||||
* }
|
||||
* </pre>
|
||||
* Above example will produce avro datastore in <code>/report/path</code>
|
||||
* with single {@link ReportEntry}.
|
||||
* Where the {@link ReportEntry#getKey()} will be equal to <code>someProperty</code> and
|
||||
* the {@link ReportEntry#getValue()} will be equal to <code>someValue</code>
|
||||
* (notice the stripped <code>report.</code> prefix from the entry key).
|
||||
*
|
||||
*
|
||||
* @author madryk
|
||||
*
|
||||
*/
|
||||
public class ReportGenerator implements Process {
|
||||
|
||||
private static final String REPORT_PORT_OUT_NAME = "report";
|
||||
|
||||
private static final String REPORT_PROPERTY_PREFIX = "report.";
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getInputPorts() {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getOutputPorts() {
|
||||
return Collections.singletonMap(REPORT_PORT_OUT_NAME, new AvroPortType(ReportEntry.SCHEMA$));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception {
|
||||
|
||||
Map<String, String> entriesToReport = collectEntriesToReport(parameters);
|
||||
|
||||
List<ReportEntry> avroReport = convertToAvroReport(entriesToReport);
|
||||
|
||||
|
||||
FileSystem fs = FileSystem.get(conf);
|
||||
|
||||
Path reportPath = portBindings.getOutput().get(REPORT_PORT_OUT_NAME);
|
||||
|
||||
DataStore.create(avroReport, new FileSystemPath(fs, reportPath));
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------ PRIVATE --------------------------
|
||||
|
||||
private Map<String, String> collectEntriesToReport(Map<String, String> parameters) {
|
||||
|
||||
return parameters.entrySet().stream()
|
||||
.filter(property -> property.getKey().startsWith(REPORT_PROPERTY_PREFIX))
|
||||
.map(x -> Pair.of(x.getKey().substring(REPORT_PROPERTY_PREFIX.length()), x.getValue()))
|
||||
.collect(Collectors.toMap(e -> e.getLeft(), e -> e.getRight()));
|
||||
|
||||
}
|
||||
|
||||
private List<ReportEntry> convertToAvroReport(Map<String, String> entriesToReport) {
|
||||
|
||||
List<ReportEntry> avroReport = Lists.newArrayList();
|
||||
entriesToReport.forEach((key, value) -> avroReport.add(ReportEntryFactory.createCounterReportEntry(key, Long.valueOf(value))));
|
||||
|
||||
return avroReport;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,74 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.spark.pipe;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.mapred.AvroKey;
|
||||
import org.apache.hadoop.io.NullWritable;
|
||||
import org.apache.spark.SparkFiles;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import eu.dnetlib.dhp.common.utils.AvroGsonFactory;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
||||
/**
|
||||
* Executor of mapreduce scripts using spark pipes.
|
||||
* It imitates hadoop streaming behavior.
|
||||
*
|
||||
* @author madryk
|
||||
*
|
||||
*/
|
||||
public class SparkPipeExecutor implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Imitates map part of hadoop streaming job.
|
||||
* It executes provided script for every key in inputRecords rdd.
|
||||
* <br/><br/>
|
||||
* It is assumed that provided script will read records from standard input (one line for one record)
|
||||
* and write mapped record into standard output (also one line for one record).
|
||||
* Mapped record can be a key/value pair. In that case script should return key and value
|
||||
* splitted by tab (\t) character in single line.
|
||||
*/
|
||||
public JavaPairRDD<String, String> doMap(JavaPairRDD<AvroKey<GenericRecord>, NullWritable> inputRecords, String scriptName, String args) {
|
||||
|
||||
JavaRDD<String> mappedRecords = inputRecords.keys().pipe("python " + SparkFiles.get(scriptName) + " " + args);
|
||||
|
||||
JavaPairRDD<String, String> outputRecords = mappedRecords
|
||||
.mapToPair(line -> {
|
||||
String[] splittedPair = line.split("\t");
|
||||
return new Tuple2<String, String>(splittedPair[0], (splittedPair.length == 1) ? null : splittedPair[1]);
|
||||
});
|
||||
|
||||
return outputRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Imitates reduce part of hadoop streaming job.
|
||||
* <br/><br/>
|
||||
* It is assumed that provided script will read records from standard input (one line for one record)
|
||||
* and group records with the same key into single record (reduce).
|
||||
* Method assures that all input records with the same key will be transfered in adjacent lines.
|
||||
* Reduced records should be written by script into standard output (one line for one record).
|
||||
* Reduced records must be json strings of class provided as argument.
|
||||
*/
|
||||
public JavaPairRDD<AvroKey<GenericRecord>, NullWritable> doReduce(JavaPairRDD<String, String> inputRecords, String scriptName, String args, Class<? extends GenericRecord> outputClass) {
|
||||
|
||||
JavaRDD<String> reducedRecords = inputRecords.sortByKey()
|
||||
.map(record -> record._1 + ((record._2 == null) ? "" : ("\t" + record._2)))
|
||||
.pipe("python " + SparkFiles.get(scriptName) + " " + args);
|
||||
|
||||
JavaPairRDD<AvroKey<GenericRecord>, NullWritable> outputRecords = reducedRecords
|
||||
.map(recordString -> AvroGsonFactory.create().fromJson(recordString, outputClass))
|
||||
.mapToPair(record -> new Tuple2<AvroKey<GenericRecord>, NullWritable>(new AvroKey<>(record), NullWritable.get()));
|
||||
|
||||
return outputRecords;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,31 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.string;
|
||||
|
||||
/**
|
||||
* Operations on {@link CharSequence}
|
||||
*
|
||||
* @author Łukasz Dumiszewski
|
||||
*/
|
||||
|
||||
public final class CharSequenceUtils {


    //------------------------ CONSTRUCTORS --------------------------

    /**
     * Utility class, never meant to be instantiated; any attempt ends with an exception.
     */
    private CharSequenceUtils() {
        throw new IllegalStateException("may not be initialized");
    }


    //------------------------ LOGIC --------------------------

    /**
     * Null-safe variant of {@link CharSequence#toString()}.
     *
     * @param value character sequence to convert, may be null
     * @return <code>value.toString()</code>, or an empty string when <code>value</code> is null
     */
    public static String toStringWithNullToEmpty(CharSequence value) {

        if (value != null) {
            return value.toString();
        }

        return "";
    }

}
|
|
@ -1,113 +0,0 @@
|
|||
/*
|
||||
* This file is part of CoAnSys project.
|
||||
* Copyright (c) 2012-2015 ICM-UW
|
||||
*
|
||||
* CoAnSys is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
|
||||
* CoAnSys is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package eu.dnetlib.dhp.common.string;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Mapping to the basic Latin alphabet (a-z, A-Z). In most cases, a character is
|
||||
* mapped to the closest visual form, rather than functional one, e.g.: "ö" is
|
||||
* mapped to "o" rather than "oe", and "đ" is mapped to "d" rather than "dj" or
|
||||
* "gj". Notable exceptions include: "ĸ" mapped to "q", "ß" mapped to "ss", and
|
||||
* "Þ", "þ" mapped to "Y", "y".
|
||||
*
|
||||
* <p> Each character is processed as follows: <ol> <li>the character is
|
||||
* compatibility decomposed,</li> <li>all the combining marks are removed,</li>
|
||||
* <li>the character is compatibility composed,</li> <li>additional "manual"
|
||||
* substitutions are applied.</li> </ol> </p>
|
||||
*
|
||||
* <p> All the characters from the "Latin-1 Supplement" and "Latin Extended-A"
|
||||
* Unicode blocks are mapped to the "Basic Latin" block. Characters from other
|
||||
* alphabets are generally left intact, although the decomposable ones may be
|
||||
* affected by the procedure. </p>
|
||||
*
|
||||
* @author Lukasz Bolikowski (bolo@icm.edu.pl)
|
||||
*
|
||||
* @author Łukasz Dumiszewski /just copied from coansys-commons/
|
||||
*
|
||||
*/
|
||||
public final class DiacriticsRemover {

    // Characters covered by the "manual" substitution table, written as Unicode escapes so the
    // source does not depend on the encoding used by the compiler. The entry at index i is
    // replaced with SUBSTITUTES[i].
    private static final char[] SUBSTITUTED = {
        '\u00C6', '\u00D0', '\u00D8', '\u00DE', '\u00DF', '\u00E6', '\u00F0', '\u00F8', '\u00FE', '\u0110', '\u0111', '\u0126',
        '\u0127', '\u0131', '\u0138', '\u0141', '\u0142', '\u014A', '\u014B', '\u0152', '\u0153', '\u0166', '\u0167'};

    private static final String[] SUBSTITUTES = {
        "AE", "D", "O", "Y", "ss", "ae", "d", "o", "y", "D", "d", "H",
        "h", "i", "q", "L", "l", "N", "n", "OE", "oe", "T", "t"};

    // Built once at class initialization; the two arrays above must be declared before it.
    private static final Map<Character, String> lookup = buildLookup();


    //------------------------ CONSTRUCTORS -------------------

    private DiacriticsRemover() {}


    //------------------------ LOGIC --------------------------

    /**
     * Removes diacritics from a text.
     * The text is NFKD-normalized, every non-spacing mark is dropped and the characters
     * listed in the substitution table are replaced with their Basic Latin counterparts.
     *
     * @param text Text to process, may be null.
     * @return Text without diacritics, or null when <code>text</code> is null.
     */
    public static String removeDiacritics(String text) {
        if (text == null) {
            return null;
        }

        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFKD);
        StringBuilder stripped = new StringBuilder();

        for (char current : decomposed.toCharArray()) {
            if (Character.getType(current) == Character.NON_SPACING_MARK) {
                // combining mark separated from its base character by the decomposition
                continue;
            }

            String substitute = lookup.get(current);
            if (substitute == null) {
                stripped.append(current);
            } else {
                stripped.append(substitute);
            }
        }

        return stripped.toString();
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Pairs every substituted character with its replacement.
     * Fails with {@link IllegalStateException} when the two tables differ in length.
     */
    private static Map<Character, String> buildLookup() {
        if (SUBSTITUTED.length != SUBSTITUTES.length) {
            throw new IllegalStateException();
        }

        Map<Character, String> substitutions = new HashMap<Character, String>();
        int index = 0;
        for (char character : SUBSTITUTED) {
            substitutions.put(character, SUBSTITUTES[index]);
            index++;
        }

        return substitutions;
    }
}
|
|
@ -1,130 +0,0 @@
|
|||
/*
|
||||
* This file is part of CoAnSys project.
|
||||
* Copyright (c) 2012-2015 ICM-UW
|
||||
*
|
||||
* CoAnSys is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
|
||||
* CoAnSys is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package eu.dnetlib.dhp.common.string;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
/**
|
||||
* An implementation of {@link StringNormalizer} that normalizes strings for non-strict comparisons
|
||||
* in which one does not care about characters other than letters and digits or about differently written diacritics.
|
||||
*
|
||||
* @author Łukasz Dumiszewski
|
||||
*
|
||||
*/
|
||||
public final class LenientComparisonStringNormalizer implements StringNormalizer, Serializable {
|
||||
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
|
||||
private List<Character> whitelistCharacters;
|
||||
|
||||
|
||||
//------------------------ CONSTRUCTORS --------------------------
|
||||
|
||||
public LenientComparisonStringNormalizer() {
|
||||
this(ImmutableList.of());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param whitelistCharacters - non alphanumeric characters that will not be removed
|
||||
* during normalization
|
||||
*/
|
||||
public LenientComparisonStringNormalizer(List<Character> whitelistCharacters) {
|
||||
this.whitelistCharacters = whitelistCharacters;
|
||||
}
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Normalizes the given value. <br/>
|
||||
* The normalized strings are better suited for non-strict comparisons, in which one does NOT care about characters that are
|
||||
* neither letters nor digits; about accidental spaces or different diacritics etc. <br/><br/>
|
||||
* This method:
|
||||
* <ul>
|
||||
* <li>Replaces all characters that are not letters or digits with spaces (except those on whitelist characters list)</li>
|
||||
* <li>Replaces white spaces with spaces </li>
|
||||
* <li>Trims</li>
|
||||
* <li>Compacts multi-space gaps to one-space gaps</li>
|
||||
* <li>Removes diacritics</li>
|
||||
* <li>Changes characters to lower case</li>
|
||||
* </ul>
|
||||
* Returns "" if the passed value is null or blank
|
||||
*
|
||||
* @param value the string to normalize
|
||||
* @see DiacriticsRemover#removeDiacritics(String, boolean)
|
||||
*
|
||||
*
|
||||
*/
|
||||
public String normalize(String value) {
|
||||
|
||||
if (StringUtils.isBlank(value)) {
|
||||
|
||||
return "";
|
||||
|
||||
}
|
||||
|
||||
|
||||
String result = value;
|
||||
|
||||
result = DiacriticsRemover.removeDiacritics(result);
|
||||
|
||||
result = removeNonLetterDigitCharacters(result);
|
||||
|
||||
result = result.toLowerCase();
|
||||
|
||||
result = result.trim().replaceAll(" +", " ");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------ PRIVATE --------------------------
|
||||
|
||||
|
||||
private String removeNonLetterDigitCharacters(final String value) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < value.length(); ++i) {
|
||||
|
||||
char c = value.charAt(i);
|
||||
|
||||
if (Character.isLetterOrDigit(c) || whitelistCharacters.contains(c)) {
|
||||
sb.append(c);
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -1,16 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.string;
|
||||
|
||||
/**
|
||||
* String normalizer.
|
||||
*
|
||||
* @author Łukasz Dumiszewski
|
||||
*
|
||||
*/
|
||||
public interface StringNormalizer {

    /**
     * Produces the normalized form of the given string.
     * What "normalized" means is defined by the implementation.
     *
     * @param value the string to normalize
     * @return the normalized string
     */
    String normalize(String value);

}
|
|
@ -1,45 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.utils;
|
||||
|
||||
import java.lang.reflect.Type;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.gson.JsonDeserializationContext;
|
||||
import com.google.gson.JsonDeserializer;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParseException;
|
||||
|
||||
/**
|
||||
* Factory for gson object that supports serializing avro generated classes
|
||||
*
|
||||
* @author madryk
|
||||
*
|
||||
*/
|
||||
public final class AvroGsonFactory {
|
||||
|
||||
//------------------------ CONSTRUCTORS -------------------
|
||||
|
||||
|
||||
private AvroGsonFactory() {}
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
public static Gson create() {
|
||||
GsonBuilder builder = new GsonBuilder();
|
||||
|
||||
builder.registerTypeAdapter(CharSequence.class, new CharSequenceDeserializer());
|
||||
|
||||
return builder.create();
|
||||
}
|
||||
|
||||
public static class CharSequenceDeserializer implements JsonDeserializer<CharSequence> {
|
||||
|
||||
@Override
|
||||
public CharSequence deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
|
||||
throws JsonParseException {
|
||||
return json.getAsString();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -1,77 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.utils;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Mateusz Kobos
|
||||
*
|
||||
*/
|
||||
public final class AvroUtils {
|
||||
|
||||
public final static String primitiveTypePrefix = "org.apache.avro.Schema.Type.";
|
||||
|
||||
|
||||
//------------------------ CONSTRUCTORS -------------------
|
||||
|
||||
|
||||
private AvroUtils() {}
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
|
||||
/**
|
||||
* For a given name of a class generated from Avro schema return
|
||||
* a JSON schema.
|
||||
*
|
||||
* Apart from a name of a class you can also give a name of one of enums
|
||||
* defined in {@link org.apache.avro.Schema.Type}; in such case an
|
||||
* appropriate primitive type will be returned.
|
||||
*
|
||||
* @param typeName fully qualified name of a class generated from Avro schema,
|
||||
* e.g. {@code eu.dnetlib.dhp.common.avro.Person},
|
||||
* or a fully qualified name of enum defined by
|
||||
* {@link org.apache.avro.Schema.Type},
|
||||
* e.g. {@link org.apache.avro.Schema.Type.STRING}.
|
||||
* @return JSON string
|
||||
*/
|
||||
public static Schema toSchema(String typeName) {
|
||||
Schema schema = null;
|
||||
if(typeName.startsWith(primitiveTypePrefix)){
|
||||
String shortName = typeName.substring(
|
||||
primitiveTypePrefix.length(), typeName.length());
|
||||
schema = getPrimitiveTypeSchema(shortName);
|
||||
} else {
|
||||
schema = getAvroClassSchema(typeName);
|
||||
}
|
||||
return schema;
|
||||
}
|
||||
|
||||
private static Schema getPrimitiveTypeSchema(String shortName){
|
||||
Schema.Type type = Schema.Type.valueOf(shortName);
|
||||
return Schema.create(type);
|
||||
}
|
||||
|
||||
private static Schema getAvroClassSchema(String className){
|
||||
try {
|
||||
Class<?> avroClass = Class.forName(className);
|
||||
Field f = avroClass.getDeclaredField("SCHEMA$");
|
||||
return (Schema) f.get(null);
|
||||
} catch (ClassNotFoundException e) {
|
||||
throw new RuntimeException(
|
||||
"Class \""+className+"\" does not exist", e);
|
||||
} catch (SecurityException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (NoSuchFieldException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,45 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.utils;
|
||||
|
||||
/**
|
||||
* Byte array utility class.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public final class ByteArrayUtils {

    //------------------------ CONSTRUCTORS -------------------

    private ByteArrayUtils() {}

    //------------------------ LOGIC --------------------------

    /**
     * Checks whether <code>source</code> begins with the bytes of <code>match</code>.
     * @param source Byte array to examine
     * @param match Byte array expected at the beginning of <code>source</code>
     * @return true if the starting bytes are equal
     */
    public static boolean startsWith(byte[] source, byte[] match) {
        return startsWith(source, 0, match);
    }

    /**
     * Checks whether the bytes of <code>source</code> starting at <code>offset</code>
     * are equal to the bytes of <code>match</code>.
     * @param source Byte array to examine
     * @param offset An offset into the <code>source</code> array
     * @param match Byte array expected in <code>source</code> at <code>offset</code>
     * @return true if all bytes of <code>match</code> are found at the given offset,
     *      false when they differ or when fewer bytes than <code>match.length</code> remain
     */
    public static boolean startsWith(byte[] source, int offset, byte[] match) {
        final int expectedLength = match.length;
        final int remaining = source.length - offset;
        if (remaining < expectedLength) {
            return false;
        }

        int position = offset;
        for (byte expected : match) {
            if (source[position] != expected) {
                return false;
            }
            position++;
        }
        return true;
    }

}
|
|
@ -1,89 +0,0 @@
|
|||
package eu.dnetlib.dhp.common.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.security.InvalidParameterException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import eu.dnetlib.dhp.common.java.PortBindings;
|
||||
import eu.dnetlib.dhp.common.java.Ports;
|
||||
import eu.dnetlib.dhp.common.java.Process;
|
||||
import eu.dnetlib.dhp.common.java.io.CloseableIterator;
|
||||
import eu.dnetlib.dhp.common.java.io.DataStore;
|
||||
import eu.dnetlib.dhp.common.java.io.FileSystemPath;
|
||||
import eu.dnetlib.dhp.common.java.porttype.AnyPortType;
|
||||
import eu.dnetlib.dhp.common.java.porttype.PortType;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import static eu.dnetlib.dhp.common.WorkflowRuntimeParameters.OOZIE_ACTION_OUTPUT_FILENAME;
|
||||
|
||||
/**
|
||||
* Simple process verifying whether given datastore is empty.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class EmptyDatastoreVerifierProcess implements Process {
|
||||
|
||||
public static final String INPUT_PORT_NAME = "input";
|
||||
|
||||
public static final String DEFAULT_ENCODING = "UTF-8";
|
||||
|
||||
public static final String OUTPUT_PROPERTY_IS_EMPTY = "isEmpty";
|
||||
|
||||
/**
|
||||
* Ports handled by this module.
|
||||
*/
|
||||
private final Ports ports;
|
||||
|
||||
|
||||
// ------------------------ CONSTRUCTORS --------------------------
|
||||
|
||||
public EmptyDatastoreVerifierProcess() {
|
||||
// preparing ports
|
||||
Map<String, PortType> input = new HashMap<String, PortType>();
|
||||
input.put(INPUT_PORT_NAME, new AnyPortType());
|
||||
Map<String, PortType> output = Collections.emptyMap();
|
||||
ports = new Ports(input, output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getInputPorts() {
|
||||
return ports.getInput();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, PortType> getOutputPorts() {
|
||||
return ports.getOutput();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception {
|
||||
if (!portBindings.getInput().containsKey(INPUT_PORT_NAME)) {
|
||||
throw new InvalidParameterException("missing input port!");
|
||||
}
|
||||
|
||||
try (CloseableIterator<?> closeableIt = getIterator(conf, portBindings.getInput().get(INPUT_PORT_NAME))) {
|
||||
File file = new File(System.getProperty(OOZIE_ACTION_OUTPUT_FILENAME));
|
||||
Properties props = new Properties();
|
||||
props.setProperty(OUTPUT_PROPERTY_IS_EMPTY, Boolean.toString(!closeableIt.hasNext()));
|
||||
try (OutputStream os = new FileOutputStream(file)) {
|
||||
props.store(os, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns iterator over datastore.
|
||||
*/
|
||||
protected CloseableIterator<?> getIterator(Configuration conf, Path path) throws IOException {
|
||||
return DataStore.getReader(new FileSystemPath(FileSystem.get(conf), path));
|
||||
}
|
||||
|
||||
}
|
|
@ -13,50 +13,8 @@
|
|||
<artifactId>dhp-schemas</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- Plugin that generates Java classes from Avro schemas -->
|
||||
<plugin>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>schema</goal>
|
||||
<goal>idl-protocol</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!-- This plugin registers "target/generated-sources/avro" as a source path, so that
	Maven->Update Project Configuration does not drop it -->
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>add-source</id>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>${project.build.directory}/generated-sources/avro/</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
@namespace("eu.dnetlib.dhp.audit.schemas")
protocol DHP {

    // Single cause of a fault; Fault.causes holds an array of these.
    record Cause {
        // generic cause code, root exception class name when derived from exception
        string code;
        // cause message (optional, null by default)
        union { null , string } message = null;
    }

    // Description of a fault related to a single input object.
    record Fault {
        // input object identifier
        string inputObjectId;
        // fault creation timestamp
        long timestamp;
        // generic fault code, root exception class name when derived from exception
        string code;
        // fault message (optional, null by default)
        union { null , string } message = null;
        // stack trace (optional, null by default)
        union { null , string } stackTrace = null;
        // fault causes, array is indexed with cause depth (optional, null by default)
        union { null , array<Cause> } causes = null;
        // Other supplementary data related to specific type of fault (optional, null by default).
        // See parameters description in oozie workflow.xml documentation of modules
        // that use this structure for information what exactly can be stored as supplementary data.
        union { null , map<string> } supplementaryData = null;
    }
}
|
|
@ -1,16 +0,0 @@
|
|||
@namespace("eu.dnetlib.dhp.common.schemas")
protocol DHP{

    // Kind of a report entry.
    enum ReportEntryType {
        COUNTER, DURATION
    }

    // Single entry of a report: a key, the entry type and the value kept as a string.
    record ReportEntry {

        string key;
        ReportEntryType type;
        // NOTE(review): presumably the textual form of a counter or a duration,
        // depending on the type - confirm against the code creating the entries
        string value;

    }
}
|
|
@ -1,21 +0,0 @@
|
|||
@namespace("eu.dnetlib.dhp.importer.schemas")
protocol DHP {

    // Serialization format of the record body.
    enum RecordFormat {
        XML, JSON
    }

    // Record in the form it was imported, together with the details of its format.
    record ImportedRecord {

        // record identifier
        string id;

        // serialization format of the body
        RecordFormat format;

        // format name (OAF, OAI_DC, Datacite, etc) for which there is a parser implementation
        string formatName;

        // record body
        string body;
    }
}
|
|
@ -0,0 +1,26 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-spark-jobs</artifactId>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,79 @@
|
|||
package eu.dnetlib.collection;
|
||||
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.StructField;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import static org.apache.spark.sql.functions.array_contains;
|
||||
|
||||
public class GenerateNativeStoreSparkJob {
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName("GenerateNativeStoreSparkJob")
|
||||
.master("local[*]")
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaPairRDD<IntWritable, Text> f = sc.sequenceFile("/home/sandro/Downloads/mdstore_oai", IntWritable.class, Text.class);
|
||||
|
||||
String first = f.map(a -> a._2().toString()).first();
|
||||
|
||||
|
||||
final List<StructField> fields = new ArrayList<>();
|
||||
|
||||
fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
|
||||
fields.add(DataTypes.createStructField("format", DataTypes.StringType, false));
|
||||
fields.add(DataTypes.createStructField("formatName", DataTypes.StringType, true));
|
||||
fields.add(DataTypes.createStructField("body", DataTypes.StringType, true));
|
||||
|
||||
JavaRDD<Row> mdRdd = f.map((Function<Tuple2<IntWritable, Text>, Row>) item -> RowFactory.create("" + item._1().get(), "xml", null, item._2().toString()));
|
||||
|
||||
final StructType schema = DataTypes.createStructType(fields);
|
||||
Dataset<Row> ds = spark.createDataFrame(mdRdd, schema);
|
||||
|
||||
// ds.write().save("/home/sandro/Downloads/test.parquet");
|
||||
|
||||
Publication p2 = new Publication();
|
||||
p2.setDates(Collections.singletonList("2018-09-09"));
|
||||
p2.setTitles(Collections.singletonList("Titolo 2"));
|
||||
p2.setIdentifiers(Collections.singletonList(new PID("pmID", "1234567")));
|
||||
|
||||
Publication p1 = new Publication();
|
||||
p1.setDates(Collections.singletonList("2018-09-09"));
|
||||
p1.setTitles(Collections.singletonList("Titolo 1"));
|
||||
p1.setIdentifiers(Collections.singletonList(new PID("doi", "1234567")));
|
||||
|
||||
|
||||
|
||||
|
||||
Encoder<Publication> encoder = Encoders.bean(Publication.class);
|
||||
|
||||
Dataset<Publication> dp = spark.createDataset(Arrays.asList(p1,p2), encoder);
|
||||
|
||||
|
||||
long count = dp.where(array_contains(new Column("identifiers.schema"), "doi")).count();
|
||||
|
||||
System.out.println("count = " + count);
|
||||
|
||||
System.out.println(ds.count());
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -1,105 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-wf</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-wf-import</artifactId>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- required after introducing 'provided' scope for hadoop libs -->
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.googlecode.json-simple</groupId>
|
||||
<artifactId>json-simple</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-cli</groupId>
|
||||
<artifactId>commons-cli</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-objectstore-rmi</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-resultset-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
</dependency>
|
||||
<!-- proper spring context version required by cnr-resultset-client -->
|
||||
<dependency>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-context</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-frontend-jaxws</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.gson</groupId>
|
||||
<artifactId>gson</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.10</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.10</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.databricks</groupId>
|
||||
<artifactId>spark-avro_2.10</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mongodb.spark</groupId>
|
||||
<artifactId>mongo-spark-connector_2.10</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -1,29 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
|
||||
/**
|
||||
* {@link DataFileWriter} based record receiver.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class DataFileRecordReceiver<T> implements RecordReceiver<T> {
|
||||
|
||||
private final DataFileWriter<T> writer;
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
* @param writer
|
||||
*/
|
||||
public DataFileRecordReceiver(DataFileWriter<T> writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void receive(T object) throws IOException {
|
||||
this.writer.append(object);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
|
||||
/**
|
||||
* {@link DataFileWriter} based record receiver with counter of
|
||||
* received records.
|
||||
*
|
||||
* @author madryk
|
||||
*/
|
||||
public class DataFileRecordReceiverWithCounter<T> extends DataFileRecordReceiver<T> {
|
||||
|
||||
private long receivedCount = 0L;
|
||||
|
||||
|
||||
//------------------------ CONSTRUCTORS --------------------------
|
||||
|
||||
/**
|
||||
* Default constructor
|
||||
*
|
||||
* @param writer - writer of the received records
|
||||
*/
|
||||
public DataFileRecordReceiverWithCounter(DataFileWriter<T> writer) {
|
||||
super(writer);
|
||||
}
|
||||
|
||||
|
||||
//------------------------ GETTERS --------------------------
|
||||
|
||||
/**
|
||||
* Returns number of received records
|
||||
*/
|
||||
public long getReceivedCount() {
|
||||
return receivedCount;
|
||||
}
|
||||
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Receives passed record and increments the counter.
|
||||
*/
|
||||
@Override
|
||||
public void receive(T record) throws IOException {
|
||||
super.receive(record);
|
||||
++receivedCount;
|
||||
}
|
||||
}
|
|
@ -1,52 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer;
|
||||
|
||||
/**
 * Import related workflow parameters: names of the runtime properties
 * together with a few default values.
 *
 * @author mhorst
 *
 */
public final class ImportWorkflowRuntimeParameters {

    /** Constants holder, not meant to be instantiated. */
    private ImportWorkflowRuntimeParameters() {}


    //------------------------ PARAMETER NAMES: GENERAL IMPORT --------------------------

    public static final String IMPORT_INFERENCE_PROVENANCE_BLACKLIST = "import.inference.provenance.blacklist";

    public static final String IMPORT_SKIP_DELETED_BY_INFERENCE = "import.skip.deleted.by.inference";

    public static final String IMPORT_TRUST_LEVEL_THRESHOLD = "import.trust.level.threshold";

    public static final String IMPORT_APPROVED_DATASOURCES_CSV = "import.approved.datasources.csv";

    public static final String IMPORT_APPROVED_COLUMNFAMILIES_CSV = "import.approved.columnfamilies.csv";

    public static final String IMPORT_MERGE_BODY_WITH_UPDATES = "import.merge.body.with.updates";


    //------------------------ PARAMETER NAMES: CONTENT / OBJECT STORE --------------------------

    // NOTE: the "OBJECSTORES" spelling of the two identifiers below is kept as is,
    // both constants are public and may be referenced by other modules.
    public static final String IMPORT_CONTENT_APPROVED_OBJECSTORES_CSV = "import.content.approved.objectstores.csv";

    public static final String IMPORT_CONTENT_BLACKLISTED_OBJECSTORES_CSV = "import.content.blacklisted.objectstores.csv";

    public static final String IMPORT_CONTENT_OBJECT_STORE_LOC = "import.content.object.store.location";

    public static final String IMPORT_CONTENT_OBJECT_STORE_IDS_CSV = "import.content.object.store.ids.csv";

    public static final String IMPORT_CONTENT_MAX_FILE_SIZE_MB = "import.content.max.file.size.mb";

    public static final String IMPORT_CONTENT_CONNECTION_TIMEOUT = "import.content.connection.timeout";

    public static final String IMPORT_CONTENT_READ_TIMEOUT = "import.content.read.timeout";


    //------------------------ PARAMETER NAMES: MDSTORE --------------------------

    public static final String IMPORT_MDSTORE_IDS_CSV = "import.mdstore.ids.csv";

    public static final String IMPORT_MDSTORE_SERVICE_LOCATION = "import.mdstore.service.location";

    public static final String IMPORT_MDSTORE_RECORD_MAXLENGTH = "import.mdstore.record.maxlength";


    //------------------------ PARAMETER NAMES: ISLOOKUP / VOCABULARY --------------------------

    public static final String IMPORT_ISLOOKUP_SERVICE_LOCATION = "import.islookup.service.location";

    public static final String IMPORT_VOCABULARY_CODE = "import.vocabulary.code";

    public static final String IMPORT_VOCABULARY_OUTPUT_FILENAME = "import.vocabulary.output.filename";


    //------------------------ PARAMETER NAMES: RESULT SET CLIENT --------------------------

    public static final String IMPORT_RESULT_SET_CLIENT_READ_TIMEOUT = "import.resultset.client.read.timeout";

    public static final String IMPORT_RESULT_SET_CLIENT_CONNECTION_TIMEOUT = "import.resultset.client.connection.timeout";

    public static final String IMPORT_RESULT_SET_PAGESIZE = "import.resultset.pagesize";


    //------------------------ PARAMETER NAMES: MISCELLANEOUS --------------------------

    public static final String HBASE_ENCODING = "hbase.table.encoding";

    public static final String IMPORT_FACADE_FACTORY_CLASS = "import.facade.factory.classname";


    //------------------------ DEFAULT VALUES --------------------------

    public static final String RESULTSET_READ_TIMEOUT_DEFAULT_VALUE = "60000";

    public static final String RESULTSET_CONNECTION_TIMEOUT_DEFAULT_VALUE = "60000";

    public static final String RESULTSET_PAGESIZE_DEFAULT_VALUE = "100";

}
|
|
@ -1,14 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Contract of a component accepting records one at a time.
 *
 * @author mhorst
 *
 * @param <T> type of the accepted records
 */
public interface RecordReceiver<T> {

    /**
     * Accepts a single record.
     *
     * @param object record to be received
     * @throws IOException when the record could not be handled by the implementation
     */
    void receive(T object) throws IOException;

}
|
|
@ -1,104 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import javax.xml.ws.BindingProvider;
|
||||
import javax.xml.ws.wsaddressing.W3CEndpointReferenceBuilder;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import eu.dnetlib.enabling.tools.JaxwsServiceResolverImpl;
|
||||
|
||||
/**
|
||||
* Abstract class utilized by all WebService facades.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractResultSetAwareWebServiceFacade<T> {
|
||||
|
||||
private final Logger log = Logger.getLogger(this.getClass());
|
||||
|
||||
/**
|
||||
* Web service.
|
||||
*/
|
||||
private final T service;
|
||||
|
||||
/**
|
||||
* ResultSet read timeout.
|
||||
*/
|
||||
private final long resultSetReadTimeout;
|
||||
|
||||
/**
|
||||
* ResultSet connection timeout.
|
||||
*/
|
||||
private final long resultSetConnectionTimeout;
|
||||
|
||||
/**
|
||||
* ResultSet page size.
|
||||
*/
|
||||
private final int resultSetPageSize;
|
||||
|
||||
|
||||
//------------------------ CONSTRUCTORS -------------------
|
||||
|
||||
/**
|
||||
* Instantiates underlying service.
|
||||
* @param clazz webservice class
|
||||
* @param serviceLocation webservice location
|
||||
* @param serviceReadTimeout service read timeout
|
||||
* @param serviceConnectionTimeout service connection timeout
|
||||
* @param resultSetReadTimeout resultset read timeout
|
||||
* @param resultSetConnectionTimeout resultset connection timeout
|
||||
* @param resultSetPageSize resultset page size
|
||||
*/
|
||||
protected AbstractResultSetAwareWebServiceFacade(Class<T> clazz, String serviceLocation,
|
||||
long serviceReadTimeout, long serviceConnectionTimeout,
|
||||
long resultSetReadTimeout, long resultSetConnectionTimeout, int resultSetPageSize) {
|
||||
W3CEndpointReferenceBuilder eprBuilder = new W3CEndpointReferenceBuilder();
|
||||
eprBuilder.address(serviceLocation);
|
||||
eprBuilder.build();
|
||||
this.service = new JaxwsServiceResolverImpl().getService(clazz, eprBuilder.build());
|
||||
if (this.service instanceof BindingProvider) {
|
||||
log.info(String.format("setting timeouts for %s: read timeout (%s) and connect timeout (%s)",
|
||||
BindingProvider.class, serviceReadTimeout, serviceConnectionTimeout));
|
||||
final Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();
|
||||
|
||||
// can't be sure about which will be used. Set them all.
|
||||
requestContext.put("com.sun.xml.internal.ws.request.timeout", serviceReadTimeout);
|
||||
requestContext.put("com.sun.xml.internal.ws.connect.timeout", serviceConnectionTimeout);
|
||||
|
||||
requestContext.put("com.sun.xml.ws.request.timeout", serviceReadTimeout);
|
||||
requestContext.put("com.sun.xml.ws.connect.timeout", serviceConnectionTimeout);
|
||||
|
||||
requestContext.put("javax.xml.ws.client.receiveTimeout", serviceReadTimeout);
|
||||
requestContext.put("javax.xml.ws.client.connectionTimeout", serviceConnectionTimeout);
|
||||
}
|
||||
|
||||
this.resultSetReadTimeout = resultSetReadTimeout;
|
||||
this.resultSetConnectionTimeout = resultSetConnectionTimeout;
|
||||
this.resultSetPageSize = resultSetPageSize;
|
||||
}
|
||||
|
||||
|
||||
//------------------------ GETTERS -------------------------
|
||||
|
||||
public T getService() {
|
||||
return service;
|
||||
}
|
||||
|
||||
|
||||
public long getResultSetReadTimeout() {
|
||||
return resultSetReadTimeout;
|
||||
}
|
||||
|
||||
|
||||
public long getResultSetConnectionTimeout() {
|
||||
return resultSetConnectionTimeout;
|
||||
}
|
||||
|
||||
|
||||
public int getResultSetPageSize() {
|
||||
return resultSetPageSize;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
/**
|
||||
* ISLookup service facade.
|
||||
*
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public interface ISLookupFacade {
|
||||
|
||||
/**
|
||||
* Provides all profiles matching given query
|
||||
* @param xPathQuery XPath query
|
||||
*/
|
||||
Iterable<String> searchProfile(String xPathQuery) throws ServiceFacadeException;
|
||||
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
/**
|
||||
* MDStore service facade.
|
||||
*
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public interface MDStoreFacade {
|
||||
|
||||
/**
|
||||
* Delivers all records for given MDStore identifier
|
||||
* @param mdStoreId MDStore identifier
|
||||
*/
|
||||
Iterable<String> deliverMDRecords(String mdStoreId) throws ServiceFacadeException;
|
||||
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
/**
|
||||
* ObjectStore service facade.
|
||||
*
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public interface ObjectStoreFacade {
|
||||
|
||||
/**
|
||||
* Returns metadata records from given objectstore created in specified time range.
|
||||
* @param objectStoreId object store identifier
|
||||
* @param from from time in millis
|
||||
* @param until until time in millis
|
||||
*/
|
||||
Iterable<String> deliverObjects(String objectStoreId, long from, long until) throws ServiceFacadeException;
|
||||
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
/**
 * Generic checked exception raised by service facades.
 *
 * @author mhorst
 *
 */
public class ServiceFacadeException extends Exception {

    private static final long serialVersionUID = 0L;

    //------------------------ CONSTRUCTORS -------------------

    /**
     * @param message description of the failure
     */
    public ServiceFacadeException(String message) {
        super(message);
    }

    /**
     * @param cause underlying cause of the failure
     */
    public ServiceFacadeException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     * @param cause underlying cause of the failure
     */
    public ServiceFacadeException(String message, Throwable cause) {
        super(message, cause);
    }

}
|
|
@ -1,20 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Generic factory of service facades.
 * Every implementation must be instantiable with a no-argument constructor.
 *
 * @author mhorst
 *
 * @param <T> type of the created service
 */
public interface ServiceFacadeFactory<T> {

    /**
     * Creates a service of the given type, configured with the provided parameters.
     *
     * @param parameters service configuration
     * @return created service
     */
    T instantiate(Map<String, String> parameters);

}
|
|
@ -1,80 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
import static eu.dnetlib.dhp.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_FACADE_FACTORY_CLASS;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
import eu.dnetlib.dhp.wf.importer.ImportWorkflowRuntimeParameters;
|
||||
|
||||
/**
|
||||
* Service facade utility methods.
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public final class ServiceFacadeUtils {
|
||||
|
||||
//------------------------ CONSTRUCTORS -------------------
|
||||
|
||||
private ServiceFacadeUtils() {}
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
/**
|
||||
* Instantiates service based on provided parameters.
|
||||
*
|
||||
* Service factory class name is mandatory and has to be provided as {@value ImportWorkflowRuntimeParameters#IMPORT_FACADE_FACTORY_CLASS} parameter.
|
||||
* Other parameters will be used by factory itself. Factory must be instantiable with no-argument construtor.
|
||||
*
|
||||
* @param parameters set of parameters required for service instantiation
|
||||
*
|
||||
*/
|
||||
public static <T> T instantiate(Map<String, String> parameters) throws ServiceFacadeException {
|
||||
String serviceFactoryClassName = parameters.get(IMPORT_FACADE_FACTORY_CLASS);
|
||||
if (StringUtils.isBlank(serviceFactoryClassName)) {
|
||||
throw new ServiceFacadeException("unknown service facade factory, no " + IMPORT_FACADE_FACTORY_CLASS + " parameter provided!");
|
||||
}
|
||||
try {
|
||||
Class<?> clazz = Class.forName(serviceFactoryClassName);
|
||||
Constructor<?> constructor = clazz.getConstructor();
|
||||
@SuppressWarnings("unchecked")
|
||||
ServiceFacadeFactory<T> serviceFactory = (ServiceFacadeFactory<T>) constructor.newInstance();
|
||||
return serviceFactory.instantiate(parameters);
|
||||
} catch (Exception e) {
|
||||
throw new ServiceFacadeException("exception occurred while instantiating service by facade factory: " + IMPORT_FACADE_FACTORY_CLASS, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates service based on provided configuration.
|
||||
*
|
||||
* Service factory class name is mandatory and has to be provided as {@value ImportWorkflowRuntimeParameters#IMPORT_FACADE_FACTORY_CLASS} configuration entry.
|
||||
* Other parameters will be used by factory itself. Factory must be instantiable with no-argument construtor.
|
||||
*
|
||||
* @param config set of configuration entries required for service instantiation
|
||||
*/
|
||||
public static <T> T instantiate(Configuration config) throws ServiceFacadeException {
|
||||
return instantiate(buildParameters(config));
|
||||
}
|
||||
|
||||
|
||||
// ------------------------ PRIVATE --------------------------
|
||||
|
||||
/**
|
||||
* Converts configuration entries into plain map.
|
||||
*/
|
||||
private static Map<String, String> buildParameters(Configuration config) {
|
||||
ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
|
||||
for (Map.Entry<String, String> entry : config) {
|
||||
builder.put(entry);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,55 +0,0 @@
|
|||
package eu.dnetlib.dhp.wf.importer.facade;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
/**
|
||||
* WebService based database facade.
|
||||
*
|
||||
* @author mhorst
|
||||
*
|
||||
*/
|
||||
public class WebServiceISLookupFacade extends AbstractResultSetAwareWebServiceFacade<ISLookUpService> implements ISLookupFacade {
|
||||
|
||||
private static final Logger log = Logger.getLogger(WebServiceISLookupFacade.class);
|
||||
|
||||
|
||||
//------------------------ CONSTRUCTORS -------------------
|
||||
|
||||
/**
|
||||
* @param serviceLocation database service location
|
||||
* @param serviceReadTimeout service read timeout
|
||||
* @param serviceConnectionTimeout service connection timeout
|
||||
* @param resultSetReadTimeout result set providing database results read timeout
|
||||
* @param resultSetConnectionTimeout result set connection timeout
|
||||
* @param resultSetPageSize result set data chunk size
|
||||
*/
|
||||
public WebServiceISLookupFacade(String serviceLocation,
|
||||
long serviceReadTimeout, long serviceConnectionTimeout,
|
||||
long resultSetReadTimeout, long resultSetConnectionTimeout, int resultSetPageSize) {
|
||||
super(ISLookUpService.class, serviceLocation,
|
||||
serviceReadTimeout, serviceConnectionTimeout,
|
||||
resultSetReadTimeout, resultSetConnectionTimeout, resultSetPageSize);
|
||||
}
|
||||
|
||||
//------------------------ LOGIC --------------------------
|
||||
|
||||
@Override
|
||||
public Iterable<String> searchProfile(String xPathQuery) throws ServiceFacadeException {
|
||||
try {
|
||||
return getService().quickSearchProfile(xPathQuery);
|
||||
} catch (ISLookUpDocumentNotFoundException e) {
|
||||
log.error("unable to find profile for query: " + xPathQuery, e);
|
||||
return Collections.emptyList();
|
||||
} catch (ISLookUpException e) {
|
||||
throw new ServiceFacadeException("searching profiles in ISLookup failed with query '" + xPathQuery + "'", e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue