added some example code

parent 56569c223e
commit 310e44e3e4

@@ -1,3 +1,6 @@
.DS_Store
.idea/

# ---> Java
# Compiled class file
*.class
@@ -0,0 +1,105 @@
import os
import argparse
import glob
import re
import subprocess
from pathlib import Path

# Captures the text between a pair of XML tags, e.g. <version>3.15.0</version>
regex = r"<\w*>(.*)<\/\w*>"


def get_jar_path():
    # Return the first jar produced by the Maven build, or None if there is none
    for current_path in glob.glob("target/*.jar"):
        return current_path


def extract_value(line):
    # Return the text enclosed by the first XML tag found on the line
    matches = re.finditer(regex, line, re.MULTILINE)
    for match in matches:
        return match.group(1)


def extract_dependencies():
    # Collect the <dependency> blocks in pom.xml that follow the <!-- JAR NEED --> marker
    with open("pom.xml") as f:
        check_dependency = False
        dep_to_add = []
        for line in f:
            if "<!-- JAR NEED -->" in line:
                check_dependency = True
                current_deps = {}
            elif check_dependency:
                if "groupId" in line:
                    current_deps["groupId"] = extract_value(line)
                elif "artifactId" in line:
                    current_deps["artifactId"] = extract_value(line)
                elif "version" in line:
                    current_deps["version"] = extract_value(line)
                    print("version", extract_value(line))
                elif "</dependency>" in line:
                    dep_to_add.append(current_deps)
                    check_dependency = False
        return dep_to_add


def to_path(dependency):
    # Resolve a dependency to its jar inside the local Maven repository (~/.m2)
    home_dir = Path.home()
    base_path = os.path.join(home_dir, ".m2/repository/")
    group_path = os.path.join(base_path, dependency["groupId"].replace(".", "/"))
    data_path = os.path.join(group_path, dependency["artifactId"])
    version_path = os.path.join(data_path, dependency["version"])
    current_path = os.path.join(version_path, "{}-{}.jar".format(dependency["artifactId"], dependency["version"]))
    if os.path.exists(current_path):
        return "{}-{}.jar".format(dependency["artifactId"], dependency["version"]), current_path
    return None, None


if __name__ == "__main__":
    deps = extract_dependencies()

    parser = argparse.ArgumentParser(
        description="This script helps you publish and execute your check script written in Scala")
    parser.add_argument("user_name")
    parser.add_argument("reference_class")
    parser.add_argument("argument_file")
    args = parser.parse_args()

    print(f"reference_class = {args.reference_class} argument_file={args.argument_file}")

    if os.path.exists(args.argument_file):
        # Extra spark2-submit arguments, one per line of the argument file
        other_arguments = []
        with open(args.argument_file) as f:
            for line in f:
                if len(line.strip()):
                    other_arguments.append(line.strip())
        print("Cleaning and compiling the application")
        os.system("mvn clean compile package")
        main_jar_path = get_jar_path()
        if main_jar_path is None:
            raise Exception("Unable to find the jar")

        print("Copying the jar to the remote sandro_nb folder")
        os.system("ssh {}@iis-cdh5-test-gw.ocean.icm.edu.pl rm -rf sandro_nb".format(args.user_name))
        os.system("ssh {}@iis-cdh5-test-gw.ocean.icm.edu.pl mkdir sandro_nb".format(args.user_name))
        os.system(f"scp {main_jar_path} {args.user_name}@iis-cdh5-test-gw.ocean.icm.edu.pl:sandro_nb/")

        jars = []
        for item in deps:
            name, p = to_path(item)
            if p:
                print(f"Copying dependency {p} to sandro_nb/")
                os.system(f"scp {p} {args.user_name}@iis-cdh5-test-gw.ocean.icm.edu.pl:sandro_nb/")
                jars.append(name)

        j_name = ",".join(["sandro_nb/" + item for item in jars])
        name = main_jar_path.replace("target/", "")
        command = f"spark2-submit --master yarn --jars {j_name} --class {args.reference_class} sandro_nb/{name}"
        print(f"executing command {command}")

        os.system(
            "ssh {}@iis-cdh5-test-gw.ocean.icm.edu.pl {} {}".format(args.user_name, command, " ".join(other_arguments)))
    else:
        raise Exception(f"path not found {args.argument_file}")
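For context, a minimal usage sketch of the deployment script above. The script's file name is not shown in the diff and the user name and argument file are placeholders; only the class name added in this commit comes from the code itself.

# Minimal usage sketch, assuming the script above was saved as deploy_and_run.py
# (assumed name) and that "myuser" has an account on the gateway host it targets.
import subprocess

subprocess.run(
    [
        "python", "deploy_and_run.py",
        "myuser",                              # user_name used for ssh/scp (placeholder)
        "eu.dnetlib.scholix.CheckRelation",    # reference_class: the Spark main class in this commit
        "arguments.txt",                       # argument_file: one extra argument per line (placeholder)
    ],
    check=True,
)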
@@ -0,0 +1,153 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.sandro</groupId>
    <artifactId>ZeppelinNotebook</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>${net.alchim31.maven.version}</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>initialize</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-doc</id>
                        <phase>process-resources</phase> <!-- or wherever -->
                        <goals>
                            <goal>doc</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <repositories>
        <repository>
            <id>dnet45-releases</id>
            <name>D-Net 45 releases</name>
            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
            <layout>default</layout>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
        <repository>
            <id>dnet45-snapshots</id>
            <name>D-Net 45 snapshots</name>
            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
            <layout>default</layout>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
            <releases>
                <enabled>false</enabled>
            </releases>
        </repository>
        <repository>
            <id>dnet45-bootstrap-snapshot</id>
            <name>D-Net 45 Bootstrap Snapshot</name>
            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
            <layout>default</layout>
        </repository>
        <repository>
            <id>dnet45-bootstrap-release</id>
            <name>D-Net 45 Bootstrap Release</name>
            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <layout>default</layout>
        </repository>
        <repository>
            <id>cloudera</id>
            <name>Cloudera Repository</name>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${dhp.jackson.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${dhp.spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${dhp.spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- JAR NEED -->
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
            <version>3.15.0</version>
        </dependency>
    </dependencies>

    <properties>
        <net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
        <dhp.jackson.version>2.9.6</dhp.jackson.version>
        <scala.version>2.11.12</scala.version>
        <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

</project>
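To make the <!-- JAR NEED --> marker in the POM concrete: the deployment script parses every dependency that follows it and looks the artifact up in the local Maven repository. Below is a small sketch of the lookup it performs for the dhp-schemas entry; the path only exists once the artifact has been downloaded into ~/.m2.

# Sketch of the local-repository lookup for the dependency marked with <!-- JAR NEED -->.
# The coordinates come from the pom.xml above; everything else is the standard ~/.m2 layout.
import os
from pathlib import Path

dep = {"groupId": "eu.dnetlib.dhp", "artifactId": "dhp-schemas", "version": "3.15.0"}

jar_path = os.path.join(
    Path.home(), ".m2", "repository",
    dep["groupId"].replace(".", "/"),                        # eu/dnetlib/dhp
    dep["artifactId"],                                       # dhp-schemas
    dep["version"],                                          # 3.15.0
    "{}-{}.jar".format(dep["artifactId"], dep["version"]),   # dhp-schemas-3.15.0.jar
)
print(jar_path, "exists:", os.path.exists(jar_path))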
@@ -0,0 +1,52 @@
package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.schema.oaf.Relation


object CheckRelation {

  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  def countRelation(path: String, spark: SparkSession): Unit = {

    // Relations are stored as JSON lines: read them as text and deserialize with Jackson
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._
    val df = spark.read.text(path).as[String]

    val mapper = new ObjectMapper()
    // Count only the relations that were not deleted by inference
    val scholix_rel = df
      .map(s => mapper.readValue(s, classOf[Relation]))
      .as[Relation]
      .filter(r => r.getDataInfo.getDeletedbyinference == false)
      .count()

    logger.warn(s"Total number of relations: ${df.count}")
    logger.warn(s"Total number of relations not deleted by inference: ${scholix_rel}")
  }

  def main(args: Array[String]): Unit = {
    val path = args(0)
    val master = args(1)
    val conf: SparkConf = new SparkConf()
    val spark = SparkSession.builder().config(conf).master(master).getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    countRelation(path, spark)
  }

}
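Putting the pieces together: for this commit the deployment script would assemble a spark2-submit call along the lines of the sketch below and run it over ssh, appending whatever lines are in the argument file; CheckRelation itself reads the relation dump path and the Spark master from those trailing arguments. The user name, paths, and jar file names are illustrative, with the jar names assuming Maven's default <artifactId>-<version>.jar naming.

# Illustrative reconstruction of the command the deploy script builds for this commit.
# "myuser", "/path/to/relations", and "yarn" are placeholders, not values from the diff.
user = "myuser"
dep_jars = ["sandro_nb/dhp-schemas-3.15.0.jar"]       # resolved from the <!-- JAR NEED --> dependency
main_jar = "ZeppelinNotebook-1.0-SNAPSHOT.jar"        # artifactId and version from the pom.xml above
command = (
    f"spark2-submit --master yarn --jars {','.join(dep_jars)} "
    f"--class eu.dnetlib.scholix.CheckRelation sandro_nb/{main_jar}"
)
# CheckRelation.main reads args(0) = path to the relation dump and args(1) = Spark master
print(f"ssh {user}@iis-cdh5-test-gw.ocean.icm.edu.pl {command} /path/to/relations yarn")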