diff --git a/dnet-and-test/job-override.properties b/dnet-and-test/job-override.properties index e69de29..7b18a45 100644 --- a/dnet-and-test/job-override.properties +++ b/dnet-and-test/job-override.properties @@ -0,0 +1,10 @@ +workingPath = /tmp/lda_working_dir +numPartitions = 1000 +entitiesPath = /tmp/publications_with_pid_pubmed +inputFieldJPath = $.description[0].value +vocabularyPath = /tmp/lda_working_dir/essential_science_vocabulary +vocabularyType = file +trainRatio = 0.8 +numTopics = 5,10,15,20,25,30,35,40,45,50 +maxIterations = 200 +outputModelPath = /tmp/lda_working_dir/bestLdaModel diff --git a/dnet-and-test/pom.xml b/dnet-and-test/pom.xml index b8df727..636c2ff 100644 --- a/dnet-and-test/pom.xml +++ b/dnet-and-test/pom.xml @@ -1,20 +1,570 @@ - + + 4.0.0 + eu.dnetlib dnet-and 1.0.0-SNAPSHOT + ../pom.xml dnet-and-test + jar - - 8 - 8 - UTF-8 - + + + + org.apache.maven.plugins + maven-deploy-plugin + 2.7 + + true + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + **/*.java + + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + + + + + edu.cmu + secondstring + + + + eu.dnetlib + dnet-feature-extraction + ${project.version} + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-graphx_2.11 + + + org.apache.spark + spark-sql_2.11 + + + org.apache.spark + spark-mllib_2.11 + + + + org.junit.jupiter + junit-jupiter + test + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-core + + + + org.scala-lang + scala-library + + + + com.jayway.jsonpath + json-path + + + + org.mockito + mockito-core + test + + + org.mockito + mockito-junit-jupiter + test + + + + + + + oozie-package + + + + org.apache.maven.plugins + maven-enforcer-plugin + 1.4.1 + + + enforce-connection-properties-file-existence + initialize + + enforce + + + + + + ${dhpConnectionProperties} + + + The file with connection properties could not be found. Please, create the ${dhpConnectionProperties} file or set the location to another already created file by using + -DdhpConnectionProperties property. 
+ + + + true + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy dependencies + prepare-package + + copy-dependencies + + + ${oozie.package.dependencies.include.scope} + ${oozie.package.dependencies.exclude.scope} + true + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + attach-test-resources-package + prepare-package + + test-jar + + + ${oozie.package.skip.test.jar} + + + + + + + eu.dnetlib.primer + primer-maven-plugin + 1.2.0 + + + priming + prepare-package + + prime + + + + ${project.build.directory}/dependency/*.jar + ${project.build.directory}/*-tests.jar + ${project.build.directory}/classes + + ${project.build.directory}/dependency + ${project.build.directory}/${primed.dir} + ${workflow.source.dir} + + + + + + + org.kuali.maven.plugins + properties-maven-plugin + ${properties.maven.plugin.version} + + + eu.dnetlib + dhp-build-assembly-resources + ${project.version} + + + + + + reading-dhp-properties + initialize + + read-project-properties + + + + ${dhpConnectionProperties} + + false + + + + read-default-properties + prepare-package + + read-project-properties + + + + classpath:project-default.properties + + true + + + + read-job-properties + prepare-package + + read-project-properties + + + + ${project.build.directory}/${primed.dir}/job.properties + job-override.properties + + true + + + + + + eu.dnetlib + dhp-build-properties-maven-plugin + ${project.version} + + + validate + + generate-properties + + + + + + + write-job-properties + prepare-package + + write-project-properties + + + target/${oozie.package.file.name}/job.properties + + nameNode,jobTracker,queueName,importerQueueName,oozieLauncherQueueName, + workingDir,oozieTopWfApplicationPath,oozieServiceLoc, + sparkDriverMemory,sparkExecutorMemory,sparkExecutorCores, + oozie.wf.application.path,projectVersion,oozie.use.system.libpath, + oozieActionShareLibForSpark1,spark1YarnHistoryServerAddress,spark1EventLogDir, + oozieActionShareLibForSpark2,spark2YarnHistoryServerAddress,spark2EventLogDir, + sparkSqlWarehouseDir + + true + + + ${project.build.directory}/${primed.dir}/job.properties + job-override.properties + + + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.11 + + + + revision + + + + + true + yyyy-MM-dd'T'HH:mm:ssZ + true + target/${oozie.package.file.name}/${oozieAppDir}/version.properties + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.0.0 + + + eu.dnetlib + dhp-build-assembly-resources + ${project.version} + + + + + assembly-oozie-installer + package + + single + + + false + ${oozie.package.file.name}_shell_scripts + + oozie-installer + + + + + + + + + + + maven-antrun-plugin + + + + installer-copy-custom + process-resources + + run + + + + + + + + + + + package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + run + + + + + + + + + deploy + + + + org.codehaus.mojo + exec-maven-plugin + 1.5.0 + + + create-target-dir + package + + exec + + + ssh + + ${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name} + -p ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/ + + + + + upload-oozie-package + package + + exec + + + scp + + -P ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + target/${oozie.package.file.name}.tar.gz + 
${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz + + + + + extract-and-upload-to-hdfs + package + + exec + + + ssh + + + ${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name} + -p ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; + tar -zxf oozie-package.tar.gz; + rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; + ./upload_workflow.sh + + + + + + + + + + run + + + + org.codehaus.mojo + exec-maven-plugin + 1.5.0 + + + run-job + package + + exec + + + ssh + + ${oozie.execution.log.file.location} + + ${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name} + -p ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; + ./run_workflow.sh + + + + + show-run-log-on-stdout + package + + exec + + + cat + + ${oozie.execution.log.file.location} + + + + + + + + + \ No newline at end of file diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java index 193a70a..f9c18ad 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java @@ -2,12 +2,9 @@ package eu.dnetlib.jobs; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.support.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; @@ -49,12 +46,6 @@ public abstract class AbstractSparkJob implements Serializable { dataset.write().option("compression", "gzip").mode(mode).json(outPath); } - protected static DedupConfig loadDedupConfig(String dedupConfPath) throws IOException { - return DedupConfig.load( - readFileFromHDFS(dedupConfPath) - ); - } - protected static String readFileFromHDFS(String filePath) throws IOException { Path path=new Path(filePath); diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java index 4776600..53ef3e1 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java @@ -1,2 +1,71 @@ -package eu.dnetlib.jobs;public class SparkCountVectorizer { +package eu.dnetlib.jobs; + +import eu.dnetlib.featureextraction.FeatureTransformer; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.hadoop.fs.shell.Count; +import org.apache.spark.SparkConf; +import org.apache.spark.ml.feature.CountVectorizerModel; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +public class SparkCountVectorizer extends AbstractSparkJob{ + + private static final Logger log = 
LoggerFactory.getLogger(SparkCountVectorizer.class);
+
+    public SparkCountVectorizer(ArgumentApplicationParser parser, SparkSession spark) {
+        super(parser, spark);
+    }
+
+    public static void main(String[] args) throws Exception {
+
+        ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class)
+        );
+
+        parser.parseArgument(args);
+
+        SparkConf conf = new SparkConf();
+
+        new SparkCountVectorizer(
+                parser,
+                getSparkSession(conf)
+        ).run();
+    }
+
+    @Override
+    public void run() throws IOException {
+
+        // read oozie parameters
+        final String workingPath = parser.get("workingPath");
+        final String vocabularyPath = parser.get("vocabularyPath");
+        final int numPartitions = Optional
+                .ofNullable(parser.get("numPartitions"))
+                .map(Integer::valueOf)
+                .orElse(NUM_PARTITIONS);
+
+        log.info("workingPath: '{}'", workingPath);
+        log.info("vocabularyPath: '{}'", vocabularyPath);
+        log.info("numPartitions: '{}'", numPartitions);
+
+        //read input tokens
+        Dataset<Row> inputTokensDS = spark.read().load(workingPath + "/tokens").repartition(numPartitions);
+        //read vocabulary
+        CountVectorizerModel vocabulary = FeatureTransformer.loadVocabulary(vocabularyPath);
+
+        Dataset<Row> countVectorizedData = FeatureTransformer.countVectorizeData(inputTokensDS, vocabulary);
+
+        countVectorizedData
+                .write()
+                .mode(SaveMode.Overwrite)
+                .save(workingPath + "/countVectorized");
+
+    }
 }
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java
index 9758eee..b56c52e 100644
--- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java
+++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java
@@ -1,2 +1,94 @@
-package eu.dnetlib.jobs;public class SparkCreateVocabulary {
+package eu.dnetlib.jobs;
+
+import eu.dnetlib.featureextraction.FeatureTransformer;
+import eu.dnetlib.support.ArgumentApplicationParser;
+import org.apache.spark.SparkConf;
+import org.apache.spark.ml.feature.CountVectorizerModel;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.Paths;
+import java.util.Optional;
+
+public class SparkCreateVocabulary extends AbstractSparkJob {
+
+    final static int VOCAB_SIZE = 1 << 18;
+    final static double MIN_DF = 0.1;
+    final static double MIN_TF = 1;
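+    // defaults follow Spark's CountVectorizer semantics: minDF/minTF values greater than or
+    // equal to 1.0 are absolute counts, values in [0, 1) are fractions (of all documents for
+    // DF, of a document's token count for TF); VOCAB_SIZE caps the vocabulary at 2^18 terms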
+
+    private static final Logger log = LoggerFactory.getLogger(SparkCreateVocabulary.class);
+
+    public SparkCreateVocabulary(ArgumentApplicationParser parser, SparkSession spark) {
+        super(parser, spark);
+    }
+
+    public static void main(String[] args) throws Exception {
+
+        ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class)
+        );
+
+        parser.parseArgument(args);
+
+        SparkConf conf = new SparkConf();
+
+        new SparkCreateVocabulary(
+                parser,
+                getSparkSession(conf)
+        ).run();
+    }
+
+    @Override
+    public void run() throws IOException {
+
+        // read oozie parameters
+        final String workingPath = parser.get("workingPath");
+        final String vocabularyPath = parser.get("vocabularyPath");
+        final String vocabularyType = parser.get("vocabularyType"); //from file or from tokens
+        final double minDF = Optional
+                .ofNullable(parser.get("minDF"))
+                .map(Double::valueOf)
+                .orElse(MIN_DF);
+        final double minTF = Optional
+                .ofNullable(parser.get("minTF"))
+                .map(Double::valueOf)
+                .orElse(MIN_TF);
+        final int numPartitions = Optional
+                .ofNullable(parser.get("numPartitions"))
+                .map(Integer::valueOf)
+                .orElse(NUM_PARTITIONS);
+        final int vocabSize = Optional
+                .ofNullable(parser.get("vocabSize"))
+                .map(Integer::valueOf)
+                .orElse(VOCAB_SIZE);
+
+        log.info("workingPath: '{}'", workingPath);
+        log.info("vocabularyPath: '{}'", vocabularyPath);
+        log.info("vocabularyType: '{}'", vocabularyType);
+        log.info("minDF: '{}'", minDF);
+        log.info("minTF: '{}'", minTF);
+        log.info("vocabSize: '{}'", vocabSize);
+
+        Dataset<Row> inputTokensDS = spark.read().load(workingPath + "/tokens").repartition(numPartitions);
+        CountVectorizerModel vocabulary;
+        if (vocabularyType.equals("file")) {
+            try {
+                vocabulary = FeatureTransformer.createVocabularyFromFile(Paths
+                        .get(getClass().getResource("/eu/dnetlib/jobs/support/vocabulary_words.txt").toURI())
+                        .toFile()
+                        .getAbsolutePath());
+            } catch (URISyntaxException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        else {
+            vocabulary = FeatureTransformer.createVocabularyFromTokens(inputTokensDS, minDF, minTF, vocabSize);
+        }
+
+        vocabulary.write().overwrite().save(vocabularyPath);
+    }
 }
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java
index a90d243..9a4e5b8 100644
--- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java
+++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java
@@ -1,11 +1,90 @@
 package eu.dnetlib.jobs;
 
-import java.io.IOException;
+import eu.dnetlib.featureextraction.FeatureTransformer;
+import eu.dnetlib.featureextraction.util.Utilities;
+import eu.dnetlib.support.ArgumentApplicationParser;
+import org.apache.spark.SparkConf;
+import org.apache.spark.ml.clustering.LDAModel;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
 
-public class SparkLDAHyperparametersTuning extends AbstractSparkJob{
+import java.io.IOException;
+import java.util.*;
+
+public class SparkLDATuning extends AbstractSparkJob {
+
+    private static final Logger log = LoggerFactory.getLogger(SparkLDATuning.class);
+
+    public SparkLDATuning(ArgumentApplicationParser parser, SparkSession spark) {
+        super(parser, spark);
+    }
+
+    public static void main(String[] args) throws Exception {
+
+        ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class)
+        );
+
+        parser.parseArgument(args);
+
+        SparkConf conf = new SparkConf();
+
+        new SparkLDATuning(
+                parser,
+                getSparkSession(conf)
+        ).run();
+    }
 
     @Override
-    void run() throws IOException {
+    public void run() throws IOException {
+
+        // read oozie parameters
+        final String workingPath = parser.get("workingPath");
+        final int maxIterations = Integer.parseInt(parser.get("maxIterations"));
+        final double trainRatio = Double.parseDouble(parser.get("trainRatio"));
+        int[] numTopics = Arrays.stream(parser.get("numTopics").split(",")).mapToInt(Integer::parseInt).toArray();
+        final String outputModelPath = parser.get("outputModelPath");
+        final int numPartitions = Optional
+                .ofNullable(parser.get("numPartitions"))
+                .map(Integer::valueOf)
+                .orElse(NUM_PARTITIONS);
+
+        log.info("workingPath: '{}'", workingPath);
+        log.info("numPartitions: '{}'", numPartitions);
+        log.info("maxIterations: '{}'", maxIterations);
+        log.info("numTopics: '{}'", Arrays.toString(numTopics));
+        log.info("trainRatio: '{}'", trainRatio);
+        log.info("outputModelPath: '{}'", outputModelPath);
+
+        Dataset<Row> inputFeaturesDS = spark.read().load(workingPath + "/countVectorized");
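+        // ldaTuning trains one model per value of k on the trainRatio split and returns each
+        // model paired with its perplexity on the held-out part; lower is better, so the loop
+        // below keeps the minimum and writes one CSV line per k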
log.info("maxIterations: '{}'", maxIterations); + log.info("numTopics: '{}'", numTopics.toString()); + log.info("trainRatio: '{}'", trainRatio); + log.info("outputModelPath: '{}'", outputModelPath); + + Dataset inputFeaturesDS = spark.read().load(workingPath + "/countVectorized"); + Map> ldaModels = + FeatureTransformer.ldaTuning(inputFeaturesDS, trainRatio, numTopics, maxIterations); + + double bestPerplexity = 100L; + LDAModel bestModel = null; + + List stats = new ArrayList<>(); + stats.add("k,perplexity,path"); + for(Integer k: ldaModels.keySet()) { + //save LDAModel + ldaModels.get(k)._1().write().overwrite().save(workingPath + "/lda_model_k" + k); + //prepare line + stats.add(k + "," + ldaModels.get(k)._2() + "," + workingPath + "/lda_model_k" + k); + + //pick the best model + bestModel = (ldaModels.get(k)._2() <= bestPerplexity)? ldaModels.get(k)._1() : bestModel; + bestPerplexity = Math.min(ldaModels.get(k)._2(), bestPerplexity); + } + + bestModel.write().overwrite().save(outputModelPath); + Utilities.writeLinesToHDFSFile(stats, workingPath + "/perplexity_stats.csv"); } + } + diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkTokenizer.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkTokenizer.java index 7e5fc83..f60b187 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkTokenizer.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkTokenizer.java @@ -1,2 +1,69 @@ -package eu.dnetlib.jobs;public class SparkTokenizer { +package eu.dnetlib.jobs; + +import eu.dnetlib.featureextraction.FeatureTransformer; +import eu.dnetlib.featureextraction.util.Utilities; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.core.Feature; +import java.io.IOException; +import java.util.Optional; + +public class SparkTokenizer extends AbstractSparkJob { + + private static final Logger log = LoggerFactory.getLogger(SparkTokenizer.class); + + public SparkTokenizer(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + readResource("/jobs/parameters/tokenizer_parameters.json", SparkTokenizer.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkTokenizer( + parser, + getSparkSession(conf) + ).run(); + } + + @Override + public void run() throws IOException { + // read oozie parameters + final String entitiesPath = parser.get("entitiesPath"); + final String workingPath = parser.get("workingPath"); + final String inputFieldJPath = parser.get("inputFieldJPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("entitiesPath: '{}'", entitiesPath); + log.info("workingPath: '{}'", workingPath); + log.info("inputField: '{}'", inputFieldJPath); + log.info("numPartitions: '{}'", numPartitions); + + JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); + SQLContext sqlContext = SQLContext.getOrCreate(spark.sparkContext()); + + Dataset inputDS = Utilities.prepareDataset(sqlContext, context.textFile(entitiesPath).repartition(numPartitions), inputFieldJPath); + + Dataset tokensDS = 
FeatureTransformer.tokenizeData(inputDS); + + tokensDS + .write() + .mode(SaveMode.Overwrite) + .save(workingPath + "/tokens"); + } } diff --git a/dnet-and-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java b/dnet-and-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java index e2dae4a..d2eb00e 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/support/ArgumentApplicationParser.java @@ -68,7 +68,7 @@ public class ArgumentApplicationParser implements Serializable { GZIPOutputStream gzip = new GZIPOutputStream(out); gzip.write(value.getBytes()); gzip.close(); - return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); + return Base64.getEncoder().encodeToString(out.toByteArray()); } public void parseArgument(final String[] args) throws Exception { diff --git a/dnet-and-test/src/main/resources/eu/dnetlib/jobs/support/vocabulary_words.txt b/dnet-and-test/src/main/resources/eu/dnetlib/jobs/support/vocabulary_words.txt index e69de29..6ec7812 100644 --- a/dnet-and-test/src/main/resources/eu/dnetlib/jobs/support/vocabulary_words.txt +++ b/dnet-and-test/src/main/resources/eu/dnetlib/jobs/support/vocabulary_words.txt @@ -0,0 +1,436 @@ +hypothesis +experiment +control +model +graph +precision +accuracy +assumption +procedure +observation +inference +method +framework +data +prediction +quantitative +qualitative +bias +constant +variable +chart +trend +empirical +evidence +cell +chromosome +cellulose +chloroplast +cytoplasm +diffusion +lysosome +meiosis +membrane +mitochondrion +mitosis +nucleolus +nucleus +organelle +osmosis +permeable +photosynthesis +respiration +ribosome +vacuole +amphibian +arthropod +bacteria +cold-blooded +domain +eukaryote +family +fungus +genus +invertebrate +kingdom +mammal +order +phylum +prokaryote +reptile +vertebrate +virus +blood +warm-blooded +annual +bulb +chlorophyll +germinate +germination +leaf +perennial +phloem +phototropism +pollen +pollinate +root +seed +stamen +stoma +transpiration +xylem +circulation +digestion +digestive +endocrine +excretion +homeostasis +hormone +immune +immunize +infection +lymphatic +metabolism +nervous +nutrition +pathogen +reproduction +respiratory +vaccination +aorta +artery +brain +capillary +cardiac +cartilage +cerebellum +cerebrum +cranium +epidermis +esophagus +femur +gland +hemoglobin +involuntary +joint +ligament +muscle +nerve +neuron +organ +plasma +platelet +skeleton +sternum +synapse +tendon +tissue +vein +ventricle +vertebra +voluntary +autotrophic +biome +camouflage +carnivore +commensalism +community +competition +consumer +decomposer +habitat +herbivore +heterotrophic +host +interdependent +migration +mutualism +niche +nutrient +omnivore +organism +parasite +parasitism +population +predator +prey +producer +scavenger +succession +symbiosis +adaptation +allele +clone +dominant +extinction +gene +genome +genotype +heredity +heterozygous +homologous +homozygous +hybrid +inherit +mutation +natural +selection +offspring +phenotype +probability +recessive +species +trait +variation +altitude +core +crust +deposition +elevation +epoch +equator +era +erosion +fossil +geology +hydrosphere +igneous +lithosphere +mantle +metamorphic +paleontology +petrifaction +prehistoric +sedimentary +sedimentation +stratum +tide +aftershock +canyon +continent +continental +drift +desert +dormant +earthquake +epicenter +eruption +fault +geyser +glacier +iceberg +lava +magma +molten +plate +tectonics +plateau +ridge 
+rift +savanna +seismic +seismograph +subduction +tundra +volcano +watershed +barometer +blizzard +climate +change +condensation +convection +current +cyclone +desertification +drought +evaporation +front +humidity +hurricane +meteorology +monsoon +precipitation +pressure +sleet +temperature +thermometer +tornado +tropical +tsunami +weather +aquifer +biodegradable +biodiversity +biomass +biosphere +conservation +decay +deforestation +depletion +ecology +extraction +fission +fuel +fracking +geothermal +global +warming +irrigation +landfill +mineral +resource +ozone +pesticide +petroleum +pollutant +pollution +reclamation +recycle +renewable +reservoir +salinity +sustainable +turbine +apogee +asteroid +astronomy +atmosphere +axis +constellation +comet +corona +eclipse +elliptical +galaxy +luminosity +lunar +meteor +meteorite +nadir +nebula +observatory +orbit +perigee +pulsar +quasar +solar +stellar +supernova +vacuum +wane +zenith +alloy +anion +atom +bond +cation +compound +density +ductile +electron +element +gas +ion +isotope +liquid +malleable +mass +metal +metalloid +molecule +neutron +nonmetal +polar +proton +solid +substance +volume +acid +base +catalyst +concentration +dissolve +enzyme +oxidation +precipitate +reactant +reaction +saturate +solubility +solute +solution +solvent +substrate +synthesis +conduction +endothermic +energy +entropy +equilibrium +exothermic +heat +insulation +matter +nuclear +thermal +acceleration +axle +centripetal +deceleration +force +friction +fulcrum +gravity +inclined +inertia +kinetic +lever +machine +momentum +motion +potential +power +pulley +screw +speed +tensile +torque +velocity +wedge +amplitude +circuit +compression +crest +diffraction +emission +frequency +magnet +medium +particle +period +pole +radiation +rarefaction +reflection +refraction +spectrum +trough +ultraviolet +wavelength +x-ray +ray +biology +taxonomy +plant +ecosystem +genetic +evolution +geologic +feature +environment +space +physic +wave +electricity +magnetism \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/countVectorizer_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/countVectorizer_parameters.json index e69de29..37017a7 100644 --- a/dnet-and-test/src/main/resources/jobs/parameters/countVectorizer_parameters.json +++ b/dnet-and-test/src/main/resources/jobs/parameters/countVectorizer_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "location of the working directory", + "paramRequired": true + }, + { + "paramName": "v", + "paramLongName": "vocabularyPath", + "paramDescription": "location to store the vocabulary", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/createVocabulary_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/createVocabulary_parameters.json index e69de29..7882a2b 100644 --- a/dnet-and-test/src/main/resources/jobs/parameters/createVocabulary_parameters.json +++ b/dnet-and-test/src/main/resources/jobs/parameters/createVocabulary_parameters.json @@ -0,0 +1,44 @@ +[ + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "location of the working directory", + "paramRequired": true + }, + { + "paramName": "v", + "paramLongName": 
"vocabularyPath", + "paramDescription": "location to store the vocabulary", + "paramRequired": true + }, + { + "paramName": "vt", + "paramLongName": "vocabularyType", + "paramDescription": "type of vocabulary: it could ben 'tokens' if generated with tokens or 'file' if generated from file of words", + "paramRequired": true + }, + { + "paramName": "md", + "paramLongName": "minDF", + "paramDescription": "specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer greater than or equal to 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents", + "paramRequired": false + }, + { + "paramName": "mt", + "paramLongName": "minTF", + "paramDescription": "filter to ignore rare words in a document. For each document, terms with frequency/count less than the given threshold are ignored. If this is an integer greater than or equal to 1, then this specifies a count (of times the term must appear in the document); if this is a double in [0,1), then this specifies a fraction (out of the document's token count)", + "paramRequired": false + }, + { + "paramName": "s", + "paramLongName": "vocabSize", + "paramDescription": "size of the vocabulary", + "paramRequired": false + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/ldaTuning_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/ldaTuning_parameters.json index e69de29..7d45eac 100644 --- a/dnet-and-test/src/main/resources/jobs/parameters/ldaTuning_parameters.json +++ b/dnet-and-test/src/main/resources/jobs/parameters/ldaTuning_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + }, + { + "paramName": "tr", + "paramLongName": "trainRatio", + "paramDescription": "dataset percentage to be used as training set, the remaining part is the test set", + "paramRequired": true + }, + { + "paramName": "nt", + "paramLongName": "numTopics", + "paramDescription": "comma separated number of topics to tune the model", + "paramRequired": true + }, + { + "paramName": "mi", + "paramLongName": "maxIterations", + "paramDescription": "maximum number of iteration of the algorithm", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputModelPath", + "paramDescription": "best model in terms of perplexity", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/tokenizer_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/tokenizer_parameters.json index e69de29..23ecd45 100644 --- a/dnet-and-test/src/main/resources/jobs/parameters/tokenizer_parameters.json +++ b/dnet-and-test/src/main/resources/jobs/parameters/tokenizer_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "i", + "paramLongName": "entitiesPath", + "paramDescription": "the input data: entities that should be tokenized", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "inputFieldJPath", + 
"paramDescription": "the jpath of the field to be tokenized", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml b/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml index a42086a..f943e92 100644 --- a/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml +++ b/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + entitiesPath @@ -13,8 +13,32 @@ number of partitions for the spark files - dedupConfPath - path for the dedup configuration file + inputFieldJPath + json path of the input field in the entities + + + vocabularyPath + location of the vocabulary + + + vocabularyType + type of the vocabulary: file or tokens + + + trainRatio + percentage of the data to be used as training set + + + numTopics + number of topics to which test the LDA model + + + maxIterations + maximum number of iterations of the LDA algorithm + + + outputModelPath + location of the best LDA model sparkDriverMemory @@ -81,22 +105,22 @@ - + - + yarn cluster - Create Similarity Relations - eu.dnetlib.jobs.SparkCreateSimRels - dnet-dedup-test-${projectVersion}.jar + Tokenize Data + eu.dnetlib.jobs.SparkTokenizer + dnet-and-test-${projectVersion}.jar --num-executors=32 - --executor-memory=12G - --executor-cores=4 - --driver-memory=4G + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -105,27 +129,26 @@ --conf spark.dynamicAllocation.enabled=false --entitiesPath${entitiesPath} + --inputFieldJPath${inputFieldJPath} --workingPath${workingPath} --numPartitions${numPartitions} - --dedupConfPath${dedupConfPath} - --useTree${useTree} - + - + yarn cluster - Create Merge Relations - eu.dnetlib.jobs.SparkCreateMergeRels - dnet-dedup-test-${projectVersion}.jar + Create Vocabulary + eu.dnetlib.jobs.SparkCreateVocabulary + dnet-and-test-${projectVersion}.jar --num-executors=32 - --executor-memory=12G - --executor-cores=4 - --driver-memory=4G + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -133,22 +156,22 @@ --conf spark.sql.shuffle.partitions=3840 --conf spark.dynamicAllocation.enabled=true - --entitiesPath${entitiesPath} + --vocabularyPath${vocabularyPath} + --vocabularyType${vocabularyType} --workingPath${workingPath} --numPartitions${numPartitions} - --dedupConfPath${dedupConfPath} - + - + yarn cluster Compute Statistics - eu.dnetlib.jobs.SparkComputeStatistics - dnet-dedup-test-${projectVersion}.jar + eu.dnetlib.jobs.SparkCountVectorizer + dnet-and-test-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -159,7 +182,36 @@ --conf 
spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --entitiesPath${entitiesPath} + --vocabularyPath${vocabularyPath} + --workingPath${workingPath} + --numPartitions${numPartitions} + + + + + + + + yarn + cluster + LDA Tuning + eu.dnetlib.jobs.SparkLDATuning + dnet-and-test-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + + --trainRatio${trainRatio} + --numTopics${numTopics} + --maxIterations${maxIterations} + --outputModelPath${outputModelPath} --workingPath${workingPath} --numPartitions${numPartitions} @@ -167,31 +219,5 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java b/dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java index 665f83d..348928c 100644 --- a/dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java +++ b/dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.jobs; import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; @@ -8,6 +9,7 @@ import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.junit.jupiter.MockitoExtension; +import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Paths; @@ -15,11 +17,14 @@ import java.nio.file.Paths; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestInstance(TestInstance.Lifecycle.PER_CLASS) -public class SparkJobsTest { +public class LDAAnalysisTest { static SparkSession spark; static JavaSparkContext context; final static String workingPath = "/tmp/working_dir"; + final static String tokensPath = workingPath + "/tokens"; + final static String vocabularyPath = workingPath + "/vocabulary"; + final static String bestLDAModelPath = workingPath + "/bestLDAmodel"; final static String numPartitions = "20"; final String inputDataPath = Paths .get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI()) @@ -27,12 +32,11 @@ public class SparkJobsTest { .getAbsolutePath(); final static String inputFieldJPath = "$.description[0].value"; - public SparkJobsTest() throws URISyntaxException {} + public LDAAnalysisTest() throws URISyntaxException {} public static void cleanup() throws IOException { //remove directories and clean workspace - //TODO add directories to be removed - //FileUtils.deleteDirectory(new File(path)); + FileUtils.deleteDirectory(new File(workingPath)); } @BeforeAll @@ -48,10 +52,10 @@ public class SparkJobsTest { context = JavaSparkContext.fromSparkContext(spark.sparkContext()); } - @AfterAll - public static void finalCleanUp() throws IOException { - cleanup(); - } +// @AfterAll +// public static void finalCleanUp() throws IOException { +// cleanup(); +// } @Test @Order(1) @@ -75,6 +79,69 @@ public class SparkJobsTest { } + @Test + @Order(2) + public void createVocabularyTest() 
throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class)); + + parser.parseArgument( + new String[] { + "-w", workingPath, + "-v", vocabularyPath, + "-vt", "file" + } + ); + + new SparkCreateVocabulary( + parser, + spark + ).run(); + } + + @Test + @Order(3) + public void countVectorizeTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class)); + + parser.parseArgument( + new String[]{ + "-w", workingPath, + "-v", vocabularyPath, + "-np", numPartitions + } + ); + + new SparkCountVectorizer( + parser, + spark + ).run(); + + } + + @Test + @Order(4) + public void ldaTuningTest() throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class)); + + parser.parseArgument( + new String[]{ + "-w", workingPath, + "-np", numPartitions, + "-tr", "0.8", + "-nt", "2,3,4,5", + "-mi", "5", + "-o", bestLDAModelPath + }); + + new SparkLDATuning( + parser, + spark + ).run(); + + } + public static String readResource(String path, Class clazz) throws IOException { return IOUtils.toString(clazz.getResourceAsStream(path)); } diff --git a/pom.xml b/pom.xml index 2e3f02f..63126ea 100644 --- a/pom.xml +++ b/pom.xml @@ -1,17 +1,447 @@ - + + 4.0.0 - org.example + eu.dnetlib dnet-and - 1.0-SNAPSHOT + 1.0.0-SNAPSHOT + + pom + + http://www.d-net.research-infrastructures.eu + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + + + scm:git:https://code-repo.d4science.org/michele.debonis/dnet-and.git + dnet-and-1.0.0 + + + + dhp-build + dnet-feature-extraction + dnet-and-test + + + + Redmine + https://issue.openaire.research-infrastructures.eu/projects/openaire + + + + + dnet45-releases + D-Net 45 Releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + default + + + + + + dnet-deps + dnet-dependencies + https://maven.d4science.org/nexus/content/repositories/dnet-deps + default + + + dnet45-releases + D-Net 45 Releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + default + + true + + + + dnet45-snapshots + D-Net 45 Snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + true + + + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + + ceon + Ceon Repository + https://maven.ceon.pl/artifactory/repo + + true + + + false + + + + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.0 + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + + + + false + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.0 + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.13 + + + integration-test + + integration-test + + + + verify + + 
verify + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + + + + + + iis-releases + iis releases plugin repository + http://maven.ceon.pl/artifactory/iis-releases + default + + - 8 - 8 + + oozie-package + src/test/resources/define/path/pointing/to/directory/holding/oozie_app + oozie_app + UTF-8 + UTF-8 + + 2.2.2 + 15.0 + + 2.2.0 + 2.5.5 + 2.6.5 + 3.3.3 + + 3.5 + 2.4 + 3.2.1 + 1.1.3 + + 4.9 + 2.11.8 + + false + 3.6.0 + + + default + default + default + primed + + runtime + + true + + ${user.home}/.dhp/application.properties + + ${maven.build.timestamp} + + ${project.version} + true + 2.0.1 + 5.6.1 + ../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar + - - \ No newline at end of file + + + + + edu.cmu + secondstring + 1.0.0 + + + org.antlr + stringtemplate + 3.2 + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + provided + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + provided + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + provided + + + + org.mockito + mockito-core + ${mockito-core.version} + test + + + + org.mockito + mockito-junit-jupiter + ${mockito-core.version} + test + + + + org.apache.commons + commons-math3 + 3.6.1 + + + + com.google.guava + guava + ${google.guava.version} + + + com.google.code.gson + gson + ${google.gson.version} + + + + org.apache.commons + commons-lang3 + ${commons.lang.version} + + + + commons-io + commons-io + ${commons.io.version} + + + commons-collections + commons-collections + ${commons.collections.version} + + + commons-logging + commons-logging + ${commons.logging.version} + + + org.apache.spark + spark-core_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-graphx_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-mllib_2.11 + ${spark.version} + provided + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + test + + + + org.reflections + reflections + 0.9.10 + + + + org.scala-lang + scala-library + ${scala.version} + + + + org.apache.oozie + oozie-client + 5.1.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + + + + + + + + + + +
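
A minimal driver sketch of how the four jobs in this patch chain together, mirroring the order enforced by the Oozie workflow and LDAAnalysisTest (tokenize, create vocabulary, count-vectorize, LDA tuning). The local master, the paths, and the partition count are illustrative assumptions rather than values taken from the patch; the constructors, run() methods, and parser flags are the ones defined above.

import eu.dnetlib.jobs.*;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.SparkSession;

public class LDAPipelineDriver {
    public static void main(String[] args) throws Exception {
        // single local session; on the cluster each step runs as a separate spark action
        SparkSession spark = SparkSession.builder().appName("lda-pipeline").master("local[*]").getOrCreate();
        String workingPath = "/tmp/lda_working_dir";     // assumed scratch directory
        String entitiesPath = "/tmp/publications.json";  // assumed JSON-lines input

        // 1) tokenize the field addressed by the JSON path
        ArgumentApplicationParser p = parser("/jobs/parameters/tokenizer_parameters.json");
        p.parseArgument(new String[]{"-i", entitiesPath, "-w", workingPath, "-f", "$.description[0].value", "-np", "4"});
        new SparkTokenizer(p, spark).run();

        // 2) build the vocabulary from the bundled word list
        p = parser("/jobs/parameters/createVocabulary_parameters.json");
        p.parseArgument(new String[]{"-w", workingPath, "-v", workingPath + "/vocabulary", "-vt", "file"});
        new SparkCreateVocabulary(p, spark).run();

        // 3) count-vectorize the tokens against that vocabulary
        p = parser("/jobs/parameters/countVectorizer_parameters.json");
        p.parseArgument(new String[]{"-w", workingPath, "-v", workingPath + "/vocabulary", "-np", "4"});
        new SparkCountVectorizer(p, spark).run();

        // 4) grid-search k by perplexity; the best model lands in <workingPath>/bestLDAmodel
        p = parser("/jobs/parameters/ldaTuning_parameters.json");
        p.parseArgument(new String[]{"-w", workingPath, "-np", "4", "-tr", "0.8", "-nt", "2,3,4,5", "-mi", "5", "-o", workingPath + "/bestLDAmodel"});
        new SparkLDATuning(p, spark).run();

        spark.close();
    }

    private static ArgumentApplicationParser parser(String resource) throws Exception {
        // same pattern as readResource() in LDAAnalysisTest
        return new ArgumentApplicationParser(IOUtils.toString(LDAPipelineDriver.class.getResourceAsStream(resource)));
    }
}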