added some example code

This commit is contained in:
Sandro La Bruzzo 2022-10-18 15:08:06 +02:00
parent 56569c223e
commit 310e44e3e4
4 changed files with 313 additions and 0 deletions

3
.gitignore vendored
View File

@ -1,3 +1,6 @@
.DS_Store
.idea/
# ---> Java
# Compiled class file
*.class

105
execute_notebook.py Normal file
View File

@ -0,0 +1,105 @@
import os
import argparse
import glob
from pathlib import Path
import re
import subprocess
# Captures the text between an XML-style open/close tag pair on one line,
# e.g. "<version>1.0</version>" -> "1.0" (group 1).
regex = r"<\w*>(.*)<\/\w*>"
def get_jar_path():
    """Return the path of the first jar found under target/, or None if none exists."""
    candidates = glob.glob("target/*.jar")
    if candidates:
        return candidates[0]
    return None
# Text enclosed by a single XML open/close tag pair (compiled once, not per call).
_TAG_VALUE_RE = re.compile(r"<\w*>(.*)<\/\w*>")


def extract_value(line):
    """Return the text between the first XML tag pair in *line*.

    E.g. "<version>1.0</version>" -> "1.0".

    Args:
        line: a single line of XML text.

    Returns:
        The first match's group 1, or None when *line* has no tag pair
        (same as the implicit None of the original loop-based version).
    """
    match = _TAG_VALUE_RE.search(line)
    return match.group(1) if match else None
def extract_dependencies(pom_path="pom.xml"):
    """Collect Maven coordinates of the dependencies flagged for shipping.

    Scans *pom_path* line by line; after a line containing the marker
    "<!-- JAR NEED -->", the following groupId/artifactId/version values are
    captured until the first closing "</dependency>" tag (one dependency per
    marker, matching the original single-shot behavior).

    Args:
        pom_path: path of the POM file to scan. Defaults to "pom.xml" so
            existing zero-argument callers are unaffected.

    Returns:
        list of dicts, each with "groupId", "artifactId" and "version" keys.
    """
    dep_to_add = []
    check_dependency = False
    current_deps = {}  # initialized up front so a stray </dependency> cannot hit an unbound name
    with open(pom_path) as f:
        for line in f:
            if "<!-- JAR NEED -->" in line:
                check_dependency = True
                current_deps = {}
            elif check_dependency:
                if "groupId" in line:
                    current_deps["groupId"] = extract_value(line)
                elif "artifactId" in line:
                    current_deps["artifactId"] = extract_value(line)
                elif "version" in line:
                    current_deps["version"] = extract_value(line)
                elif "</dependency>" in line:
                    dep_to_add.append(current_deps)
                    check_dependency = False
    return dep_to_add
def to_path(dependency):
    """Resolve a dependency dict to its jar in the local Maven repository.

    Args:
        dependency: dict with "groupId", "artifactId" and "version" keys,
            as produced by extract_dependencies().

    Returns:
        A (jar_name, jar_path) tuple when the jar exists under
        ~/.m2/repository, otherwise (None, None).
    """
    # Build the file name once instead of formatting it twice (DRY).
    jar_name = "{}-{}.jar".format(dependency["artifactId"], dependency["version"])
    jar_path = os.path.join(
        Path.home(),
        ".m2/repository",
        dependency["groupId"].replace(".", "/"),
        dependency["artifactId"],
        dependency["version"],
        jar_name,
    )
    if os.path.exists(jar_path):
        return jar_name, jar_path
    return None, None
if __name__ == "__main__":
    # Read the dependency coordinates flagged with "<!-- JAR NEED -->" in pom.xml.
    deps = extract_dependencies()
    parser = argparse.ArgumentParser(
        description="This scripts help you to publish, execute your check script in scala ")
    parser.add_argument("user_name")        # remote SSH user on the gateway host
    parser.add_argument("reference_class")  # fully-qualified main class handed to spark2-submit
    parser.add_argument("argument_file")    # local file: one Spark-job argument per line
    args = parser.parse_args()
    print(f"reference_class = {args.reference_class} argument_file={args.argument_file}")
    if os.path.exists(args.argument_file):
        # Collect the non-blank lines of the argument file; they are forwarded
        # verbatim to the Spark job after the spark2-submit command.
        other_arguments = []
        with open(args.argument_file) as f:
            for line in f:
                if len(line.strip()):
                    other_arguments.append(line.strip())
        print("Cleaning Compile application ")
        # NOTE(review): os.system with interpolated user input (user_name, file
        # paths) is shell-injection prone; subprocess.run([...], shell=False)
        # would be safer — left unchanged here.
        os.system('mvn clean compile package')
        main_jar_path = get_jar_path()  # first jar produced under target/ by the build above
        if main_jar_path is None:
            raise Exception("Unable to find the jar")
        print("copy on your root folder")
        # Recreate the remote staging directory, then upload the application jar.
        os.system("ssh {}@iis-cdh5-test-gw.ocean.icm.edu.pl rm -rf sandro_nb".format(args.user_name))
        os.system("ssh {}@iis-cdh5-test-gw.ocean.icm.edu.pl mkdir sandro_nb".format(args.user_name))
        os.system(f"scp {main_jar_path} {args.user_name}@iis-cdh5-test-gw.ocean.icm.edu.pl:sandro_nb/")
        # Upload every flagged dependency jar found in the local ~/.m2
        # repository and remember its bare file name for the --jars list.
        jars = []
        for item in deps:
            name, p = to_path(item)
            if p:
                print(f"Copying dependencies {p} to lib deps")
                os.system(f"scp {p} {args.user_name}@iis-cdh5-test-gw.ocean.icm.edu.pl:sandro_nb/")
                jars.append(name)
        # Comma-separated remote paths of the dependency jars, as expected by --jars.
        j_name = ",".join(["sandro_nb/"+ item for item in jars])
        name = main_jar_path.replace("target/", "")  # bare jar file name on the remote side
        command = f"spark2-submit --master yarn --jars {j_name} --class {args.reference_class} sandro_nb/{name}"
        print(f"executing command {command}")
        # Run the submit command remotely, appending the user-supplied arguments.
        os.system(
            "ssh {}@iis-cdh5-test-gw.ocean.icm.edu.pl {} {}".format(args.user_name, command, " ".join(other_arguments)))
    else:
        raise Exception(f"path not found {args.argument_file}")

153
pom.xml Normal file
View File

@ -0,0 +1,153 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.sandro</groupId>
<artifactId>ZeppelinNotebook</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- generate scaladoc while resources are processed -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 releases</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
<snapshots>
<enabled>false</enabled>
</snapshots>
<releases>
<enabled>true</enabled>
</releases>
</repository>
<repository>
<id>dnet45-snapshots</id>
<name>D-Net 45 snapshots</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
<releases>
<enabled>false</enabled>
</releases>
</repository>
<repository>
<id>dnet45-bootstrap-snapshot</id>
<name>D-Net 45 Bootstrap Snapshot</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
<layout>default</layout>
</repository>
<repository>
<id>dnet45-bootstrap-release</id>
<name>D-Net 45 Bootstrap Release</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<layout>default</layout>
</repository>
<repository>
<id>cloudera</id>
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${dhp.jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
<!-- JAR NEED -->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>3.15.0</version>
</dependency>
</dependencies>
<properties>
<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<scala.version>2.11.12</scala.version>
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>
</project>

View File

@ -0,0 +1,52 @@
package eu.dnetlib.scholix
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.schema.oaf.Relation
object CheckRelation {

  // Logger named after this object's runtime class.
  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  /** Logs the total number of Relation records found at `path` and how many
    * of them are not flagged as deleted by inference.
    *
    * @param path  input path of a text source with one JSON-serialized Relation per line
    * @param spark active SparkSession used to read the data
    */
  def countRelation(path:String, spark: SparkSession ): Unit = {
    // Kryo encoder for the project-defined Relation type.
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._
    val df = spark.read.text(path).as[String]
    val mapper = new ObjectMapper()
    // NOTE(review): assumes every line parses as a Relation and that
    // getDataInfo is never null — a malformed line or missing dataInfo
    // would fail the job; confirm upstream guarantees before hardening.
    val scholix_rel =df.map(s=> mapper.readValue(s, classOf[Relation])).as[Relation].filter(r => r.getDataInfo.getDeletedbyinference == false).count()
    logger.warn(s"Total number of relations: ${df.count}")
    logger.warn(s"Total number of relations not deleted by Inference: ${scholix_rel}")
  }

  /** Entry point: args(0) = input path, args(1) = Spark master URL. */
  def main(args: Array[String]): Unit = {
    val path = args(0)
    val master = args(1)
    val conf:SparkConf = new SparkConf()
    val spark = SparkSession.builder().config(conf).master(master).getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    countRelation(path,spark)
  }
}