From 12fb11c8c13c48d61c9be1765e08086e71b87ea3 Mon Sep 17 00:00:00 2001
From: miconis
Date: Thu, 27 Apr 2023 21:37:05 +0200
Subject: [PATCH] refactoring and implementation of scala feature extractor

---
 .../test.properties                           |   2 +-
 dnet-and-test/job-override.properties         |  34 ++-
 dnet-and-test/pom.xml                         |  12 -
 .../eu/dnetlib/jobs/AbstractSparkJob.java     |   2 +-
 .../eu/dnetlib/jobs/SparkCountVectorizer.java |   1 -
 .../dnetlib/jobs/SparkCreateVocabulary.java   |   2 -
 .../deeplearning/SparkCreateGroupDataSet.java |  75 ------
 .../SparkGraphClassificationTraining.java     |  99 -------
 .../SparkAuthorExtractor.java                 |  50 +++-
 .../SparkPublicationFeatureExtractor.java     | 107 ++++++++
 .../lda}/SparkLDAAnalysis.java                |   5 +-
 .../lda}/SparkLDAInference.java               |   5 +-
 .../lda}/SparkLDATuning.java                  |   5 +-
 .../oozie_app/config-default.xml              |  18 ++
 .../feature_extraction/oozie_app/workflow.xml | 172 ++++++++++++
 .../authorExtractor_parameters.json           |  22 +-
 ...raphClassificationTraining_parameters.json |  14 -
 ...ublicationFeatureExtractor_parameters.json |  44 +++
 .../lda_inference/oozie_app/workflow.xml      |  57 +---
 .../lda_tuning/oozie_app/workflow.xml         |   2 +-
 .../FeatureExtractionJobTest.java}            |  51 ++--
 .../lda}/LDAAnalysisTest.java                 |  70 +---
 .../._SUCCESS.crc                             | Bin 0 -> 8 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 ...-a993-0cfde5f36081-c000.snappy.parquet.crc | Bin 0 -> 48 bytes
 .../publications_lda_topics_subset/_SUCCESS   |   0
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4790 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4742 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4751 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4775 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4710 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4806 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4714 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4710 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4701 bytes
 ...4ab9-a993-0cfde5f36081-c000.snappy.parquet | Bin 0 -> 4630 bytes
 dnet-feature-extraction/pom.xml               |  90 ++++---
 .../dnetlib/deeplearning/GroupClassifier.java | 130 ---------
 .../layers/GraphConvolutionVertex.java        |  24 --
 .../layers/GraphGlobalAddPool.java            |  21 --
 .../support/DataSetProcessor.java             |  88 ------
 .../support/GroupMultiDataSet.java            |  11 -
 .../support/NetworkConfigurations.java        |  97 ------
 .../deeplearning/support/PlotUtils.java       | 253 ------------------
 .../java/eu/dnetlib/example/Example.scala     |  76 ++++++
 .../featureextraction/FeatureTransformer.java |  19 +-
 .../ScalaFeatureTransformer.scala             | 157 +++++++++++
 .../main/java/eu/dnetlib/support/Author.java  |  33 +--
 .../eu/dnetlib/support/AuthorsFactory.java    |  32 ++-
 .../dnetlib/support/ConnectedComponent.java   |  70 +++++
 .../java/eu/dnetlib/support/Relation.java     |  52 ++++
 .../src/test/java/UtilityTest.java            |   3 +-
 .../deeplearning/DataSetProcessorTest.java    |  47 ----
 .../NetworkConfigurationTests.java            |  33 ---
 .../FeatureTransformerTest.java               |  53 ++++
 pom.xml                                       | 165 ++++------
 64 files changed, 1061 insertions(+), 1242 deletions(-)
 delete mode 100644 dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkCreateGroupDataSet.java
 delete mode 100644 dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkGraphClassificationTraining.java
 rename dnet-and-test/src/main/java/eu/dnetlib/jobs/{ => featureextraction}/SparkAuthorExtractor.java (50%)
 create mode 100644 dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java
 rename dnet-and-test/src/main/java/eu/dnetlib/jobs/{ => featureextraction/lda}/SparkLDAAnalysis.java (96%)
 rename dnet-and-test/src/main/java/eu/dnetlib/jobs/{ => featureextraction/lda}/SparkLDAInference.java (93%)
 rename dnet-and-test/src/main/java/eu/dnetlib/jobs/{ => featureextraction/lda}/SparkLDATuning.java (95%)
 create mode 100644 dnet-and-test/src/main/resources/feature_extraction/oozie_app/config-default.xml
 create mode 100644 dnet-and-test/src/main/resources/feature_extraction/oozie_app/workflow.xml
 delete mode 100644 dnet-and-test/src/main/resources/jobs/parameters/graphClassificationTraining_parameters.json
 create mode 100644 dnet-and-test/src/main/resources/jobs/parameters/publicationFeatureExtractor_parameters.json
 rename dnet-and-test/src/test/java/eu/dnetlib/jobs/{deeplearning/GNNTrainingTest.java => featureextraction/FeatureExtractionJobTest.java} (63%)
 rename dnet-and-test/src/test/java/eu/dnetlib/jobs/{ => featureextraction/lda}/LDAAnalysisTest.java (68%)
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/._SUCCESS.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00000-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00001-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00002-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00003-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00004-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00005-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00006-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00007-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00008-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/.part-00009-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet.crc
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/_SUCCESS
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00000-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00001-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00002-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00003-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00004-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00005-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00006-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00007-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00008-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 create mode 100644 dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset/part-00009-c3abd217-3f3a-4ab9-a993-0cfde5f36081-c000.snappy.parquet
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/GroupClassifier.java
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphConvolutionVertex.java
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphGlobalAddPool.java
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/DataSetProcessor.java
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/GroupMultiDataSet.java
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/NetworkConfigurations.java
 delete mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/PlotUtils.java
 create mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/example/Example.scala
 create mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/ScalaFeatureTransformer.scala
 create mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/support/ConnectedComponent.java
 create mode 100644 dnet-feature-extraction/src/main/java/eu/dnetlib/support/Relation.java
 delete mode 100644 dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/DataSetProcessorTest.java
 delete mode 100644 dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/NetworkConfigurationTests.java
 create mode 100644 dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/featureextraction/FeatureTransformerTest.java

diff --git a/dhp-build/dhp-build-properties-maven-plugin/test.properties b/dhp-build/dhp-build-properties-maven-plugin/test.properties
index 0573300..a584f65 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/test.properties
+++ b/dhp-build/dhp-build-properties-maven-plugin/test.properties
@@ -1,2 +1,2 @@
-# Thu Apr 13 16:22:22 CEST 2023
+# Thu Apr 27 21:12:07 CEST 2023
 projectPropertyKey=projectPropertyValue
diff --git a/dnet-and-test/job-override.properties b/dnet-and-test/job-override.properties
index 99d28c3..09cb572 100644
--- a/dnet-and-test/job-override.properties
+++ b/dnet-and-test/job-override.properties
@@ -11,11 +11,31 @@
 #outputModelPath = /user/michele.debonis/lda_experiments/lda_dewey2.model
 
 #LDA INFERENCE
+#numPartitions = 1000
+#inputFieldJPath = $.description[0].value
+#vocabularyPath = /user/michele.debonis/lda_experiments/dewey_vocabulary
+#entitiesPath = /tmp/publications_with_pid_pubmed
+#workingPath = /user/michele.debonis/lda_experiments/lda_inference_working_dir
+#ldaInferencePath = /user/michele.debonis/lda_experiments/publications_pubmed_topics
+#ldaModelPath = /user/michele.debonis/lda_experiments/lda_dewey.model
+#authorsPath = /user/michele.debonis/lda_experiments/authors_pubmed
+
+#GNN TRAINING
+#groupsPath = /user/michele.debonis/authors_dedup/gt_dedup/groupentities
+#workingPath = /user/michele.debonis/gnn_experiments
+#numPartitions = 1000
+#numEpochs = 100
+#groundTruthJPath = $.orcid
+#idJPath = $.id
+#featuresJPath = $.topics
+
+#FEATURE EXTRACTION
+publicationsPath = /tmp/publications_with_pid_pubmed
+workingPath = /user/michele.debonis/feature_extraction
 numPartitions = 1000
-inputFieldJPath = $.description[0].value
-vocabularyPath = /user/michele.debonis/lda_experiments/dewey_vocabulary
-entitiesPath = /tmp/publications_with_pid_pubmed
-workingPath = /user/michele.debonis/lda_experiments/lda_inference_working_dir
-ldaInferencePath = /user/michele.debonis/lda_experiments/publications_pubmed_topics
-ldaModelPath = /user/michele.debonis/lda_experiments/lda_dewey.model
-authorsPath = /user/michele.debonis/lda_experiments/authors_pubmed
\ No newline at end of file
+featuresPath = /user/michele.debonis/feature_extraction/publications_pubmed_features
+topicsPath = /user/michele.debonis/lda_experiments/publications_pubmed_topics
+outputPath = /user/michele.debonis/feature_extraction/authors_pubmed
+wordEmbeddingsModel = /user/michele.debonis/nlp_models/glove_100d_en_2.4.0_2.4_1579690104032
+bertSentenceModel = /user/michele.debonis/nlp_models/sent_small_bert_L6_512_en_2.6.0_2.4_1598350624049
+bertModel = /user/michele.debonis/nlp_models/small_bert_L2_128_en_2.6.0_2.4_1598344320681
\ No newline at end of file
diff --git a/dnet-and-test/pom.xml b/dnet-and-test/pom.xml
index ddc3e22..7104cba 100644
--- a/dnet-and-test/pom.xml
+++ b/dnet-and-test/pom.xml
@@ -131,23 +131,11 @@
             json-path
-
             eu.dnetlib.dhp
             dhp-schemas
-
-
-
-
-
-
-
-
-
-
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java
index ff21a8e..a1b9737 100644
--- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java
+++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java
@@ -36,7 +36,7 @@ public abstract class AbstractSparkJob implements Serializable {
         this.spark = spark;
     }
 
-    protected abstract void run() throws IOException;
+    protected abstract void run() throws IOException, InterruptedException;
 
     protected static SparkSession getSparkSession(SparkConf conf) {
         return SparkSession.builder().config(conf).getOrCreate();
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java
b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java index ef1912c..d14b76a 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCountVectorizer.java @@ -2,7 +2,6 @@ package eu.dnetlib.jobs; import eu.dnetlib.featureextraction.FeatureTransformer; import eu.dnetlib.support.ArgumentApplicationParser; -import org.apache.hadoop.fs.shell.Count; import org.apache.spark.SparkConf; import org.apache.spark.ml.feature.CountVectorizerModel; import org.apache.spark.sql.Dataset; diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java index 4b5cdf8..45b3011 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkCreateVocabulary.java @@ -9,8 +9,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.file.Paths; import java.util.Optional; public class SparkCreateVocabulary extends AbstractSparkJob{ diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkCreateGroupDataSet.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkCreateGroupDataSet.java deleted file mode 100644 index e02f5ae..0000000 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkCreateGroupDataSet.java +++ /dev/null @@ -1,75 +0,0 @@ -package eu.dnetlib.jobs.deeplearning; - -import eu.dnetlib.deeplearning.support.DataSetProcessor; -import eu.dnetlib.jobs.AbstractSparkJob; -import eu.dnetlib.jobs.SparkLDATuning; -import eu.dnetlib.support.ArgumentApplicationParser; -import eu.dnetlib.support.ConnectedComponent; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; -import org.codehaus.jackson.map.ObjectMapper; -import org.deeplearning4j.spark.data.BatchAndExportDataSetsFunction; -import org.deeplearning4j.spark.data.BatchAndExportMultiDataSetsFunction; -import org.deeplearning4j.spark.datavec.iterator.IteratorUtils; -import org.nd4j.linalg.dataset.MultiDataSet; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.Optional; - -public class SparkCreateGroupDataSet extends AbstractSparkJob { - - private static final Logger log = LoggerFactory.getLogger(SparkCreateGroupDataSet.class); - - public SparkCreateGroupDataSet(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - readResource("/jobs/parameters/createGroupDataset_parameters.json", SparkLDATuning.class) - ); - - parser.parseArgument(args); - - SparkConf conf = new SparkConf(); - - new SparkCreateGroupDataSet( - parser, - getSparkSession(conf) - ).run(); - } - - @Override - public void run() throws IOException { - // read oozie parameters - final String groupsPath = parser.get("groupsPath"); - final String workingPath = parser.get("workingPath"); - final String groundTruthJPath = parser.get("groundTruthJPath"); - final String idJPath = parser.get("idJPath"); - final String featuresJPath = parser.get("featuresJPath"); - final int numPartitions = Optional - .ofNullable(parser.get("numPartitions")) - 
.map(Integer::valueOf) - .orElse(NUM_PARTITIONS); - - log.info("groupsPath: '{}'", groupsPath); - log.info("workingPath: '{}'", workingPath); - log.info("groundTruthJPath: '{}'", groundTruthJPath); - log.info("idJPath: '{}'", idJPath); - log.info("featuresJPath: '{}'", featuresJPath); - log.info("numPartitions: '{}'", numPartitions); - - JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - JavaRDD groups = context.textFile(groupsPath).map(g -> new ObjectMapper().readValue(g, ConnectedComponent.class)); - - JavaRDD dataset = DataSetProcessor.entityGroupToMultiDataset(groups, idJPath, featuresJPath, groundTruthJPath); - - dataset.saveAsObjectFile(workingPath + "/groupDataset"); - } - -} diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkGraphClassificationTraining.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkGraphClassificationTraining.java deleted file mode 100644 index e0772ae..0000000 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/deeplearning/SparkGraphClassificationTraining.java +++ /dev/null @@ -1,99 +0,0 @@ -package eu.dnetlib.jobs.deeplearning; - -import eu.dnetlib.deeplearning.support.DataSetProcessor; -import eu.dnetlib.deeplearning.support.NetworkConfigurations; -import eu.dnetlib.jobs.AbstractSparkJob; -import eu.dnetlib.jobs.SparkLDATuning; -import eu.dnetlib.support.ArgumentApplicationParser; -import eu.dnetlib.support.ConnectedComponent; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; -import org.codehaus.jackson.map.ObjectMapper; -import org.deeplearning4j.nn.conf.ComputationGraphConfiguration; -import org.deeplearning4j.nn.graph.ComputationGraph; -import org.deeplearning4j.optimize.listeners.PerformanceListener; -import org.deeplearning4j.optimize.solvers.accumulation.encoding.threshold.AdaptiveThresholdAlgorithm; -import org.deeplearning4j.spark.api.RDDTrainingApproach; -import org.deeplearning4j.spark.api.TrainingMaster; -import org.deeplearning4j.spark.impl.graph.SparkComputationGraph; -import org.deeplearning4j.spark.parameterserver.training.SharedTrainingMaster; -import org.nd4j.linalg.dataset.api.MultiDataSet; -import org.nd4j.parameterserver.distributed.conf.VoidConfiguration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Optional; - -public class SparkGraphClassificationTraining extends AbstractSparkJob { - - private static final Logger log = LoggerFactory.getLogger(SparkGraphClassificationTraining.class); - - public SparkGraphClassificationTraining(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - readResource("/jobs/parameters/graphClassificationTraining_parameters.json", SparkLDATuning.class) - ); - - parser.parseArgument(args); - - SparkConf conf = new SparkConf(); - - new SparkGraphClassificationTraining( - parser, - getSparkSession(conf) - ).run(); - } - - @Override - public void run() throws IOException { - // read oozie parameters - final String workingPath = parser.get("workingPath"); - final int numPartitions = Optional - .ofNullable(parser.get("numPartitions")) - .map(Integer::valueOf) - .orElse(NUM_PARTITIONS); - log.info("workingPath: '{}'", workingPath); - log.info("numPartitions: '{}'", numPartitions); - - JavaSparkContext 
context = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - VoidConfiguration conf = VoidConfiguration.builder() - .unicastPort(40123) -// .networkMask("255.255.148.0/22") - .controllerAddress("127.0.0.1") - .build(); - - TrainingMaster trainingMaster = new SharedTrainingMaster.Builder(conf,1) - .rngSeed(12345) - .collectTrainingStats(false) - .thresholdAlgorithm(new AdaptiveThresholdAlgorithm(1e-3)) - .batchSizePerWorker(32) - .workersPerNode(4) - .rddTrainingApproach(RDDTrainingApproach.Direct) - .build(); - - JavaRDD trainData = context.objectFile(workingPath + "/groupDataset"); - - SparkComputationGraph sparkComputationGraph = new SparkComputationGraph( - context, - NetworkConfigurations.getSimpleGCN(3, 2, 5, 2), - trainingMaster); - sparkComputationGraph.setListeners(new PerformanceListener(10, true)); - - //execute training - for (int i = 0; i < 20; i ++) { - sparkComputationGraph.fitMultiDataSet(trainData); - } - - ComputationGraph network = sparkComputationGraph.getNetwork(); - - System.out.println("network = " + network.getConfiguration().toJson()); - - - } -} diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkAuthorExtractor.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkAuthorExtractor.java similarity index 50% rename from dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkAuthorExtractor.java rename to dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkAuthorExtractor.java index 5de5fc0..e014cea 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkAuthorExtractor.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkAuthorExtractor.java @@ -1,5 +1,8 @@ -package eu.dnetlib.jobs; +package eu.dnetlib.jobs.featureextraction; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.jobs.AbstractSparkJob; +import eu.dnetlib.jobs.SparkTokenizer; import eu.dnetlib.support.ArgumentApplicationParser; import eu.dnetlib.support.Author; import eu.dnetlib.support.AuthorsFactory; @@ -10,15 +13,18 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.linalg.DenseVector; import org.apache.spark.sql.SparkSession; +import org.codehaus.jackson.map.DeserializationConfig; import org.codehaus.jackson.map.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import java.util.Optional; -public class SparkAuthorExtractor extends AbstractSparkJob{ +public class SparkAuthorExtractor extends AbstractSparkJob { private static final Logger log = LoggerFactory.getLogger(SparkAuthorExtractor.class); public SparkAuthorExtractor(ArgumentApplicationParser parser, SparkSession spark) { @@ -45,7 +51,8 @@ public class SparkAuthorExtractor extends AbstractSparkJob{ public void run() throws IOException { // read oozie parameters final String topicsPath = parser.get("topicsPath"); - final String entitiesPath = parser.get("entitiesPath"); + final String featuresPath = parser.get("featuresPath"); + final String publicationsPath = parser.get("publicationsPath"); final String workingPath = parser.get("workingPath"); final String outputPath = parser.get("outputPath"); final int numPartitions = Optional @@ -53,25 +60,44 @@ public class SparkAuthorExtractor extends AbstractSparkJob{ .map(Integer::valueOf) .orElse(NUM_PARTITIONS); - log.info("entitiesPath: '{}'", entitiesPath); - log.info("topicsPath: '{}'", topicsPath); - 
log.info("workingPath: '{}'", workingPath); - log.info("outputPath: '{}'", outputPath); - log.info("numPartitions: '{}'", numPartitions); + log.info("publicationsPath: '{}'", publicationsPath); + log.info("topicsPath: '{}'", topicsPath); + log.info("featuresPath: '{}'", featuresPath); + log.info("workingPath: '{}'", workingPath); + log.info("outputPath: '{}'", outputPath); + log.info("numPartitions: '{}'", numPartitions); //join publications with topics JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD entities = context.textFile(entitiesPath); + JavaRDD publications = context + .textFile(publicationsPath) + .map(x -> new ObjectMapper() + .configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false) + .readValue(x, Publication.class)); - JavaPairRDD topics = spark.read().load(topicsPath).toJavaRDD() - .mapToPair(t -> new Tuple2<>(t.getString(0), (DenseVector) t.get(1))); + JavaPairRDD topics = spark.read().load(topicsPath).toJavaRDD() + .mapToPair(t -> new Tuple2<>(t.getString(0), ((DenseVector) t.get(1)).toArray())); - JavaRDD authors = AuthorsFactory.extractAuthorsFromPublications(entities, topics); + //merge topics with other embeddings + JavaPairRDD> publicationEmbeddings = spark.read().load(featuresPath).toJavaRDD().mapToPair(t -> { + Map embeddings = new HashMap<>(); + embeddings.put("word_embeddings", ((DenseVector) t.get(1)).toArray()); + embeddings.put("bert_embeddings", ((DenseVector) t.get(2)).toArray()); + embeddings.put("bert_sentence_embeddings", ((DenseVector) t.get(3)).toArray()); + return new Tuple2<>(t.getString(0), embeddings); + }) + .join(topics).mapToPair(e -> { + e._2()._1().put("lda_topics", e._2()._2()); + return new Tuple2<>(e._1(), e._2()._1()); + }); + + JavaRDD authors = AuthorsFactory.extractAuthorsFromPublications(publications, publicationEmbeddings); authors .map(a -> new ObjectMapper().writeValueAsString(a)) .saveAsTextFile(outputPath, GzipCodec.class); } + } diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java new file mode 100644 index 0000000..ab22c58 --- /dev/null +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java @@ -0,0 +1,107 @@ +package eu.dnetlib.jobs.featureextraction; + +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.featureextraction.ScalaFeatureTransformer; +import eu.dnetlib.jobs.AbstractSparkJob; +import eu.dnetlib.support.ArgumentApplicationParser; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.codehaus.jackson.map.DeserializationConfig; +import org.codehaus.jackson.map.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Collectors; + +public class SparkPublicationFeatureExtractor extends AbstractSparkJob { + private static final Logger log = LoggerFactory.getLogger(SparkPublicationFeatureExtractor.class); + + public SparkPublicationFeatureExtractor(ArgumentApplicationParser parser, 
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java
new file mode 100644
index 0000000..ab22c58
--- /dev/null
+++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/SparkPublicationFeatureExtractor.java
@@ -0,0 +1,107 @@
+package eu.dnetlib.jobs.featureextraction;
+
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.featureextraction.ScalaFeatureTransformer;
+import eu.dnetlib.jobs.AbstractSparkJob;
+import eu.dnetlib.support.ArgumentApplicationParser;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.codehaus.jackson.map.DeserializationConfig;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+public class SparkPublicationFeatureExtractor extends AbstractSparkJob {
+
+    private static final Logger log = LoggerFactory.getLogger(SparkPublicationFeatureExtractor.class);
+
+    public SparkPublicationFeatureExtractor(ArgumentApplicationParser parser, SparkSession spark) {
+        super(parser, spark);
+    }
+
+    public static void main(String[] args) throws Exception {
+
+        ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                readResource("/jobs/parameters/publicationFeatureExtractor_parameters.json", SparkPublicationFeatureExtractor.class)
+        );
+
+        parser.parseArgument(args);
+
+        SparkConf conf = new SparkConf();
+
+        new SparkPublicationFeatureExtractor(
+                parser,
+                getSparkSession(conf)
+        ).run();
+    }
+
+    @Override
+    public void run() throws IOException {
+        // read oozie parameters (names must match publicationFeatureExtractor_parameters.json)
+        final String publicationsPath = parser.get("publicationsPath");
+        final String workingPath = parser.get("workingPath");
+        final String featuresPath = parser.get("featuresPath");
+        final String bertModel = parser.get("bertModel");
+        final String bertSentenceModel = parser.get("bertSentenceModel");
+        final String wordEmbeddingsModel = parser.get("wordEmbeddingsModel");
+        final int numPartitions = Optional
+                .ofNullable(parser.get("numPartitions"))
+                .map(Integer::valueOf)
+                .orElse(NUM_PARTITIONS);
+
+        log.info("publicationsPath: '{}'", publicationsPath);
+        log.info("workingPath: '{}'", workingPath);
+        log.info("numPartitions: '{}'", numPartitions);
+
+        JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        JavaRDD<Publication> publications = context
+                .textFile(publicationsPath)
+                .map(x -> new ObjectMapper()
+                        .configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false)
+                        .readValue(x, Publication.class));
+
+        StructType inputSchema = new StructType(new StructField[]{
+                new StructField("id", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("title", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("abstract", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("subjects", DataTypes.StringType, false, Metadata.empty())
+        });
+
+        //prepare Rows
+        Dataset<Row> inputData = spark.createDataFrame(
+                publications.map(p -> RowFactory.create(
+                        p.getId(),
+                        p.getTitle().get(0).getValue(),
+                        p.getDescription().size() > 0 ? p.getDescription().get(0).getValue() : "",
+                        p.getSubject().stream().map(StructuredProperty::getValue).collect(Collectors.joining(" ")))),
+                inputSchema);
+
+        log.info("Generating word embeddings");
+        Dataset<Row> wordEmbeddingsData = ScalaFeatureTransformer.wordEmbeddings(inputData, "subjects", wordEmbeddingsModel);
+
+        log.info("Generating bert embeddings");
+        Dataset<Row> bertEmbeddingsData = ScalaFeatureTransformer.bertEmbeddings(wordEmbeddingsData, "title", bertModel);
+
+        log.info("Generating bert sentence embeddings");
+        Dataset<Row> bertSentenceEmbeddingsData = ScalaFeatureTransformer.bertSentenceEmbeddings(bertEmbeddingsData, "abstract", bertSentenceModel);
+
+        Dataset<Row> features = bertSentenceEmbeddingsData.select("id", ScalaFeatureTransformer.WORD_EMBEDDINGS_COL(), ScalaFeatureTransformer.BERT_EMBEDDINGS_COL(), ScalaFeatureTransformer.BERT_SENTENCE_EMBEDDINGS_COL());
+
+        features
+                .write()
+                .mode(SaveMode.Overwrite)
+                .save(featuresPath);
+
+    }
+}
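The LDA threshold analysis below now reads each author's topic vector from the embeddings map (getEmbeddings().get("lda_topics")) instead of a dedicated topics field. Its cosineSimilarity helper is untouched by this patch and not shown here; for reference, a minimal standalone equivalent over the stored double[] values, whose body is an assumption reconstructed from the standard formula:

// Hedged sketch: cosine similarity between two embedding vectors, matching
// the double[] values kept in the authors' embeddings map.
object CosineSimilaritySketch {

  def cosineSimilarity(a: Array[Double], b: Array[Double]): Double = {
    require(a.length == b.length, "embedding dimensions must match")
    val dot   = a.zip(b).map { case (x, y) => x * y }.sum   // <a, b>
    val normA = math.sqrt(a.map(x => x * x).sum)            // ||a||
    val normB = math.sqrt(b.map(x => x * x).sum)            // ||b||
    if (normA == 0.0 || normB == 0.0) 0.0 else dot / (normA * normB)
  }

  def main(args: Array[String]): Unit = {
    val topicsA = Array(0.1, 0.7, 0.2)   // toy LDA topic distributions
    val topicsB = Array(0.2, 0.6, 0.2)
    println(f"similarity = ${cosineSimilarity(topicsA, topicsB)}%.4f")
  }
}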
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDAAnalysis.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDAAnalysis.java
similarity index 96%
rename from dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDAAnalysis.java
rename to dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDAAnalysis.java
index ececa9b..c0ee71f 100644
--- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDAAnalysis.java
+++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDAAnalysis.java
@@ -1,7 +1,8 @@
-package eu.dnetlib.jobs;
+package eu.dnetlib.jobs.featureextraction.lda;
 
 import com.clearspring.analytics.util.Lists;
 import eu.dnetlib.featureextraction.Utilities;
+import eu.dnetlib.jobs.AbstractSparkJob;
 import eu.dnetlib.support.ArgumentApplicationParser;
 import eu.dnetlib.support.Author;
 import eu.dnetlib.support.AuthorsFactory;
@@ -108,7 +109,7 @@ public class SparkLDAAnalysis extends AbstractSparkJob {
                 else {
                     bRes = authors.get(i).getOrcid().equals(authors.get(j).getOrcid());
                 }
-                results.add(new Tuple2<>(bRes, cosineSimilarity(authors.get(i).getTopics(), authors.get(j).getTopics())));
+                results.add(new Tuple2<>(bRes, cosineSimilarity(authors.get(i).getEmbeddings().get("lda_topics"), authors.get(j).getEmbeddings().get("lda_topics"))));
                 j++;
             }
             i++;
diff --git a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDAInference.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDAInference.java
similarity index 93%
rename from dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDAInference.java
rename to dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDAInference.java
index 83814a1..797c386 100644
--- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDAInference.java
+++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDAInference.java
@@ -1,6 +1,7 @@
-package eu.dnetlib.jobs;
+package eu.dnetlib.jobs.featureextraction.lda;
 
 import eu.dnetlib.featureextraction.FeatureTransformer;
+import eu.dnetlib.jobs.AbstractSparkJob;
 import eu.dnetlib.support.ArgumentApplicationParser;
 import org.apache.spark.SparkConf;
 import org.apache.spark.ml.clustering.LDAModel;
@@ -15,7 +16,7 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.util.*;
 
-public class SparkLDAInference extends AbstractSparkJob{
+public class SparkLDAInference extends AbstractSparkJob {
 
     private static final Logger log = LoggerFactory.getLogger(SparkLDAInference.class);
 
diff --git
a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDATuning.java similarity index 95% rename from dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java rename to dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDATuning.java index b4ac8b9..443c30e 100644 --- a/dnet-and-test/src/main/java/eu/dnetlib/jobs/SparkLDATuning.java +++ b/dnet-and-test/src/main/java/eu/dnetlib/jobs/featureextraction/lda/SparkLDATuning.java @@ -1,7 +1,8 @@ -package eu.dnetlib.jobs; +package eu.dnetlib.jobs.featureextraction.lda; import eu.dnetlib.featureextraction.FeatureTransformer; import eu.dnetlib.featureextraction.Utilities; +import eu.dnetlib.jobs.AbstractSparkJob; import eu.dnetlib.support.ArgumentApplicationParser; import org.apache.spark.SparkConf; import org.apache.spark.ml.clustering.LDAModel; @@ -15,7 +16,7 @@ import scala.Tuple2; import java.io.IOException; import java.util.*; -public class SparkLDATuning extends AbstractSparkJob{ +public class SparkLDATuning extends AbstractSparkJob { private static final Logger log = LoggerFactory.getLogger(SparkLDATuning.class); diff --git a/dnet-and-test/src/main/resources/feature_extraction/oozie_app/config-default.xml b/dnet-and-test/src/main/resources/feature_extraction/oozie_app/config-default.xml new file mode 100644 index 0000000..2e0ed9a --- /dev/null +++ b/dnet-and-test/src/main/resources/feature_extraction/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/feature_extraction/oozie_app/workflow.xml b/dnet-and-test/src/main/resources/feature_extraction/oozie_app/workflow.xml new file mode 100644 index 0000000..2aa77c0 --- /dev/null +++ b/dnet-and-test/src/main/resources/feature_extraction/oozie_app/workflow.xml @@ -0,0 +1,172 @@ + + + + publicationsPath + the input entity path + + + workingPath + path for the working directory + + + numPartitions + number of partitions for the spark files + + + featuresPath + location of the embeddings + + + topicsPath + location of the topics + + + outputPath + location of the output authors + + + bertModel + location of the bert model + + + bertSentenceModel + location of the bert sentence model + + + wordEmbeddingsModel + location of the word embeddings model + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + yarn + cluster + Publication Feature Extraction + 
eu.dnetlib.jobs.featureextraction.SparkPublicationFeatureExtractor + dnet-and-test-${projectVersion}.jar + + --num-executors=32 + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.dynamicAllocation.enabled=false + + --publicationsPath${publicationsPath} + --workingPath${workingPath} + --numPartitions${numPartitions} + --featuresPath${featuresPath} + --wordEmbeddingsModel${wordEmbeddingsModel} + --bertModel${bertModel} + --bertSentenceModel${bertSentenceModel} + + + + + + + + yarn + cluster + Author Extraction + eu.dnetlib.jobs.featureextraction.SparkAuthorExtractor + dnet-and-test-${projectVersion}.jar + + --num-executors=32 + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.dynamicAllocation.enabled=true + + --workingPath${workingPath} + --numPartitions${numPartitions} + --publicationsPath${publicationsPath} + --topicsPath${topicsPath} + --featuresPath${featuresPath} + --outputPath${outputPath} + + + + + + + \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/authorExtractor_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/authorExtractor_parameters.json index 4905b64..bee9c78 100644 --- a/dnet-and-test/src/main/resources/jobs/parameters/authorExtractor_parameters.json +++ b/dnet-and-test/src/main/resources/jobs/parameters/authorExtractor_parameters.json @@ -6,11 +6,23 @@ "paramRequired": true }, { - "paramName": "e", - "paramLongName": "entitiesPath", + "paramName": "p", + "paramLongName": "publicationsPath", "paramDescription": "location of the input entities", "paramRequired": true }, + { + "paramName": "t", + "paramLongName": "topicsPath", + "paramDescription": "location of the lda topics", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "featuresPath", + "paramDescription": "location of the features", + "paramRequired": true + }, { "paramName": "np", "paramLongName": "numPartitions", @@ -22,11 +34,5 @@ "paramLongName": "outputPath", "paramDescription": "location of the output author extracted", "paramRequired": false - }, - { - "paramName": "t", - "paramLongName": "topicsPath", - "paramDescription": "location of the lda topics", - "paramRequired": false } ] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/graphClassificationTraining_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/graphClassificationTraining_parameters.json deleted file mode 100644 index 8e6b4d2..0000000 --- a/dnet-and-test/src/main/resources/jobs/parameters/graphClassificationTraining_parameters.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "paramName": "w", - "paramLongName": "workingPath", - "paramDescription": "path of the working directory", - "paramRequired": true - }, - { - "paramName": "np", 
- "paramLongName": "numPartitions", - "paramDescription": "number of partitions for the similarity relations intermediate phases", - "paramRequired": false - } -] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/jobs/parameters/publicationFeatureExtractor_parameters.json b/dnet-and-test/src/main/resources/jobs/parameters/publicationFeatureExtractor_parameters.json new file mode 100644 index 0000000..db5af73 --- /dev/null +++ b/dnet-and-test/src/main/resources/jobs/parameters/publicationFeatureExtractor_parameters.json @@ -0,0 +1,44 @@ +[ + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false + }, + { + "paramName": "p", + "paramLongName": "publicationsPath", + "paramDescription": "location of the publications", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "featuresPath", + "paramDescription": "location of the features", + "paramRequired": true + }, + { + "paramName": "we", + "paramLongName": "wordEmbeddingsModel", + "paramDescription": "path of the word embeddings model", + "paramRequired": true + }, + { + "paramName": "bm", + "paramLongName": "bertModel", + "paramDescription": "path of the bert model", + "paramRequired": true + }, + { + "paramName": "bs", + "paramLongName": "bertSentenceModel", + "paramDescription": "path of the bert sentence model", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/lda_inference/oozie_app/workflow.xml b/dnet-and-test/src/main/resources/lda_inference/oozie_app/workflow.xml index 4537200..e16e896 100644 --- a/dnet-and-test/src/main/resources/lda_inference/oozie_app/workflow.xml +++ b/dnet-and-test/src/main/resources/lda_inference/oozie_app/workflow.xml @@ -155,7 +155,7 @@ yarn cluster LDA Inference - eu.dnetlib.jobs.SparkLDAInference + eu.dnetlib.jobs.featureextraction.lda.SparkLDAInference dnet-and-test-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -172,63 +172,8 @@ --ldaModelPath${ldaModelPath} --numPartitions${numPartitions} - - - - - - - yarn - cluster - LDA Inference - eu.dnetlib.jobs.SparkAuthorExtractor - dnet-and-test-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --entitiesPath${entitiesPath} - --workingPath${workingPath} - --outputPath${authorsPath} - --numPartitions${numPartitions} - --topicsPath${ldaInferencePath} - - - - - - - - yarn - cluster - LDA Threshold Analysis - eu.dnetlib.jobs.SparkLDAAnalysis - dnet-and-test-${projectVersion}.jar - - --num-executors=32 - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - 
--conf spark.sql.shuffle.partitions=3840 - --conf spark.dynamicAllocation.enabled=false - - --authorsPath${authorsPath} - --workingPath${workingPath} - --numPartitions${numPartitions} - - \ No newline at end of file diff --git a/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml b/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml index add9e78..e109b7b 100644 --- a/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml +++ b/dnet-and-test/src/main/resources/lda_tuning/oozie_app/workflow.xml @@ -195,7 +195,7 @@ yarn cluster LDA Tuning - eu.dnetlib.jobs.SparkLDATuning + eu.dnetlib.jobs.featureextraction.lda.SparkLDATuning dnet-and-test-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} diff --git a/dnet-and-test/src/test/java/eu/dnetlib/jobs/deeplearning/GNNTrainingTest.java b/dnet-and-test/src/test/java/eu/dnetlib/jobs/featureextraction/FeatureExtractionJobTest.java similarity index 63% rename from dnet-and-test/src/test/java/eu/dnetlib/jobs/deeplearning/GNNTrainingTest.java rename to dnet-and-test/src/test/java/eu/dnetlib/jobs/featureextraction/FeatureExtractionJobTest.java index 762e175..2a57b5e 100644 --- a/dnet-and-test/src/test/java/eu/dnetlib/jobs/deeplearning/GNNTrainingTest.java +++ b/dnet-and-test/src/test/java/eu/dnetlib/jobs/featureextraction/FeatureExtractionJobTest.java @@ -1,6 +1,7 @@ -package eu.dnetlib.jobs.deeplearning; +package eu.dnetlib.jobs.featureextraction; import eu.dnetlib.jobs.AbstractSparkJob; +import eu.dnetlib.jobs.SparkTokenizer; import eu.dnetlib.support.ArgumentApplicationParser; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; @@ -15,22 +16,22 @@ import java.nio.file.Paths; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestInstance(TestInstance.Lifecycle.PER_CLASS) -public class GNNTrainingTest { - +public class FeatureExtractionJobTest { static SparkSession spark; static JavaSparkContext context; final static String workingPath = "/tmp/working_dir"; - final static String numPartitions = "20"; final String inputDataPath = Paths - .get(getClass().getResource("/eu/dnetlib/jobs/examples/authors.groups.example.json").toURI()) + .get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI()) .toFile() .getAbsolutePath(); - final static String groundTruthJPath = "$.orcid"; - final static String idJPath = "$.id"; - final static String featuresJPath = "$.topics"; - public GNNTrainingTest() throws URISyntaxException {} + final String ldaTopicsPath = Paths + .get(getClass().getResource("/eu/dnetlib/jobs/examples/publications_lda_topics_subset").toURI()) + .toFile() + .getAbsolutePath(); + + public FeatureExtractionJobTest() throws URISyntaxException {} public static void cleanup() throws IOException { //remove directories and clean workspace @@ -57,43 +58,43 @@ public class GNNTrainingTest { @Test @Order(1) - public void createGroupDataSetTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createGroupDataset_parameters.json", SparkCreateGroupDataSet.class)); + public void publicationFeatureExtractionTest() throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/publicationFeatureExtractor_parameters.json", SparkTokenizer.class)); parser.parseArgument( new String[] { - "-i", inputDataPath, - "-gt", groundTruthJPath, - "-id", idJPath, - "-f", featuresJPath, + "-p", inputDataPath, "-w", workingPath, - "-np", 
numPartitions + "-np", "20" } ); - new SparkCreateGroupDataSet( + new SparkPublicationFeatureExtractor( parser, spark ).run(); - } @Test @Order(2) - public void graphClassificationTrainingTest() throws Exception{ - ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/graphClassificationTraining_parameters.json", SparkGraphClassificationTraining.class)); + public void authorExtractionTest() throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/authorExtractor_parameters.json", SparkAuthorExtractor.class)); parser.parseArgument( - new String[] { + new String[]{ + "-p", inputDataPath, "-w", workingPath, - "-np", numPartitions - } - ); + "-np", "20", + "-t", ldaTopicsPath, + "-f", workingPath + "/publication_features", + "-o", workingPath + "/authors" + }); - new SparkGraphClassificationTraining( + new SparkAuthorExtractor( parser, spark ).run(); + } public static String readResource(String path, Class clazz) throws IOException { diff --git a/dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java b/dnet-and-test/src/test/java/eu/dnetlib/jobs/featureextraction/lda/LDAAnalysisTest.java similarity index 68% rename from dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java rename to dnet-and-test/src/test/java/eu/dnetlib/jobs/featureextraction/lda/LDAAnalysisTest.java index 8c59008..80cbf90 100644 --- a/dnet-and-test/src/test/java/eu/dnetlib/jobs/LDAAnalysisTest.java +++ b/dnet-and-test/src/test/java/eu/dnetlib/jobs/featureextraction/lda/LDAAnalysisTest.java @@ -1,5 +1,9 @@ -package eu.dnetlib.jobs; +package eu.dnetlib.jobs.featureextraction.lda; +import eu.dnetlib.jobs.AbstractSparkJob; +import eu.dnetlib.jobs.SparkCountVectorizer; +import eu.dnetlib.jobs.SparkCreateVocabulary; +import eu.dnetlib.jobs.SparkTokenizer; import eu.dnetlib.support.ArgumentApplicationParser; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; @@ -157,74 +161,12 @@ public class LDAAnalysisTest { parser, spark ).run(); - } - - @Test - @Order(6) - public void authorExtractorTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/authorExtractor_parameters.json", SparkLDAInference.class)); - - parser.parseArgument( - new String[]{ - "-e", inputDataPath, - "-o", authorsPath, - "-t", topicsPath, - "-w", workingPath, - "-np", numPartitions - }); - - new SparkAuthorExtractor( - parser, - spark - ).run(); - } - - @Test - @Order(7) - public void ldaAnalysis() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/ldaAnalysis_parameters.json", SparkLDAAnalysis.class)); - - parser.parseArgument( - new String[]{ - "-i", authorsPath, - "-w", workingPath, - "-np", numPartitions - }); - - new SparkLDAAnalysis( - parser, - spark - ).run(); - - Thread.sleep(1000000000); + Thread.sleep(100000); } public static String readResource(String path, Class clazz) throws IOException { return IOUtils.toString(clazz.getResourceAsStream(path)); } - -// @Test -// public void createVocabulary() { -// -// StructType inputSchema = new StructType(new StructField[]{ -// new StructField("id", DataTypes.StringType, false, Metadata.empty()), -// new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) -// }); -// -// JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); -// JavaRDD rows = sc.textFile("/Users/miconis/Desktop/dewey").map(s 
-> s.substring(4)).map(s -> Utilities.normalize(s).replaceAll("  ", " ")).filter(s -> !s.contains("unassigned")).map(s -> RowFactory.create("id", s));
-//
-//        Dataset<Row> dataFrame = spark.createDataFrame(rows, inputSchema);
-//
-//        dataFrame = FeatureTransformer.tokenizeData(dataFrame);
-//
-//        JavaRDD<String> map = dataFrame.toJavaRDD().map(r -> r.getList(1)).flatMap(l -> l.iterator()).map(s -> s.toString()).distinct();
-//
-//        map.coalesce(1).saveAsTextFile("/tmp/vocab_raw");
-//        System.out.println("map = " + map.count());
-//        System.out.println("dataFrame = " + map.first());
-//    }
-
 }
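The publications_lda_topics_subset fixture added below (as binary parquet) holds (id, topic vector) rows of the kind written by SparkLDAInference; FeatureExtractionJobTest hands it to SparkAuthorExtractor through the -t flag. A hedged sketch of inspecting the fixture, assuming the (String, Vector) column layout the Java jobs rely on:

// Hypothetical inspection of the parquet test fixture; the resource path and
// column layout are inferred from the tests and jobs above.
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.SparkSession

object TopicsFixtureSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("topics-fixture").getOrCreate()
    val topics = spark.read
      .load("dnet-and-test/src/test/resources/eu/dnetlib/jobs/examples/publications_lda_topics_subset")
      .rdd
      .map(r => (r.getString(0), r.getAs[Vector](1).toArray))   // (publication id, topic distribution)
    topics.take(3).foreach { case (id, t) => println(s"$id -> ${t.length} topics") }
    spark.stop()
  }
}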
diff --git a/dnet-feature-extraction/pom.xml b/dnet-feature-extraction/pom.xml
--- a/dnet-feature-extraction/pom.xml
+++ b/dnet-feature-extraction/pom.xml
@@ … @@
     <artifactId>dnet-feature-extraction</artifactId>
     <packaging>jar</packaging>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.0.1</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>initialize</phase>
+                        <goals>
+                            <goal>add-source</goal>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
     <dependencies>
         <dependency>
             <groupId>org.apache.spark</groupId>
@@ -53,42 +83,36 @@
[several added lines in this hunk lost their XML markup in extraction and are not reconstructed]
-        <dependency>
-            <groupId>org.nd4j</groupId>
-            <artifactId>${nd4j.backend}</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>org.deeplearning4j</groupId>
-            <artifactId>deeplearning4j-core</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>org.deeplearning4j</groupId>
-            <artifactId>deeplearning4j-datasets</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>org.deeplearning4j</groupId>
-            <artifactId>dl4j-spark-parameterserver_2.11</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>org.deeplearning4j</groupId>
-            <artifactId>dl4j-spark_2.11</artifactId>
+        <dependency>
+            <groupId>com.johnsnowlabs.nlp</groupId>
+            <artifactId>spark-nlp_${scala.binary.version}</artifactId>
         </dependency>
-        <dependency>
-            <groupId>jfree</groupId>
-            <artifactId>jfreechart</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>org.jfree</groupId>
-            <artifactId>jcommon</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib</groupId>
-            <artifactId>dnet-dedup-test</artifactId>
-        </dependency>
diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/GroupClassifier.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/GroupClassifier.java
deleted file mode 100644
index 09435ff..0000000
--- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/GroupClassifier.java
+++ /dev/null
@@ -1,130 +0,0 @@
-//package eu.dnetlib.deeplearning;
-//
-///* *****************************************************************************
-// *
-// *
-// *
-// * This program and the accompanying materials are made available under the
-// * terms of the Apache License, Version 2.0 which is available at -// * https://www.apache.org/licenses/LICENSE-2.0. -// * See the NOTICE file distributed with this work for additional -// * information regarding copyright ownership. -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// * License for the specific language governing permissions and limitations -// * under the License. -// * -// * SPDX-License-Identifier: Apache-2.0 -// ******************************************************************************/ -// -//import org.datavec.api.records.reader.RecordReader; -//import org.datavec.api.records.reader.impl.csv.CSVRecordReader; -//import org.datavec.api.split.FileSplit; -//import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator; -//import org.deeplearning4j.examples.utils.DownloaderUtility; -//import org.deeplearning4j.examples.utils.PlotUtil; -//import org.deeplearning4j.nn.conf.MultiLayerConfiguration; -//import org.deeplearning4j.nn.conf.NeuralNetConfiguration; -//import org.deeplearning4j.nn.conf.layers.DenseLayer; -//import org.deeplearning4j.nn.conf.layers.OutputLayer; -//import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -//import org.deeplearning4j.nn.weights.WeightInit; -//import org.deeplearning4j.optimize.listeners.ScoreIterationListener; -//import org.nd4j.evaluation.classification.Evaluation; -//import org.nd4j.linalg.activations.Activation; -//import org.nd4j.linalg.api.ndarray.INDArray; -//import org.nd4j.linalg.dataset.DataSet; -//import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; -//import org.nd4j.linalg.learning.config.Nesterovs; -//import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction; -// -//import java.io.File; -//import java.util.concurrent.TimeUnit; -// -//public class GroupClassifier { -// -// public static boolean visualize = true; -// public static String dataLocalPath; -// -// public static void main(String[] args) throws Exception { -// int seed = 123; -// double learningRate = 0.01; -// int batchSize = 50; -// int nEpochs = 30; -// -// int numInputs = 2; -// int numOutputs = 2; -// int numHiddenNodes = 20; -// -// dataLocalPath = DownloaderUtility.CLASSIFICATIONDATA.Download(); -// //Load the training data: -// RecordReader rr = new CSVRecordReader(); -// rr.initialize(new FileSplit(new File(dataLocalPath, "linear_data_train.csv"))); -// DataSetIterator trainIter = new RecordReaderDataSetIterator(rr, batchSize, 0, 2); -// -// //Load the test/evaluation data: -// RecordReader rrTest = new CSVRecordReader(); -// rrTest.initialize(new FileSplit(new File(dataLocalPath, "linear_data_eval.csv"))); -// DataSetIterator testIter = new RecordReaderDataSetIterator(rrTest, batchSize, 0, 2); -// -// MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() -// .seed(seed) -// .weightInit(WeightInit.XAVIER) -// .updater(new Nesterovs(learningRate, 0.9)) -// .list() -// .layer(new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) -// .activation(Activation.RELU) -// .build()) -// .layer(new OutputLayer.Builder(LossFunction.NEGATIVELOGLIKELIHOOD) -// .activation(Activation.SOFTMAX) -// .nIn(numHiddenNodes).nOut(numOutputs).build()) -// .build(); -// -// -// MultiLayerNetwork model = new MultiLayerNetwork(conf); -// model.init(); -// model.setListeners(new ScoreIterationListener(10)); //Print score every 10 parameter 
updates -// -// model.fit(trainIter, nEpochs); -// -// System.out.println("Evaluate model...."); -// Evaluation eval = new Evaluation(numOutputs); -// while (testIter.hasNext()) { -// DataSet t = testIter.next(); -// INDArray features = t.getFeatures(); -// INDArray labels = t.getLabels(); -// INDArray predicted = model.output(features, false); -// eval.eval(labels, predicted); -// } -// //An alternate way to do the above loop -// //Evaluation evalResults = model.evaluate(testIter); -// -// //Print the evaluation statistics -// System.out.println(eval.stats()); -// -// System.out.println("\n****************Example finished********************"); -// //Training is complete. Code that follows is for plotting the data & predictions only -// generateVisuals(model, trainIter, testIter); -// } -// -// public static void generateVisuals(MultiLayerNetwork model, DataSetIterator trainIter, DataSetIterator testIter) throws Exception { -// if (visualize) { -// double xMin = 0; -// double xMax = 1.0; -// double yMin = -0.2; -// double yMax = 0.8; -// int nPointsPerAxis = 100; -// -// //Generate x,y points that span the whole range of features -// INDArray allXYPoints = PlotUtil.generatePointsOnGraph(xMin, xMax, yMin, yMax, nPointsPerAxis); -// //Get train data and plot with predictions -// PlotUtil.plotTrainingData(model, trainIter, allXYPoints, nPointsPerAxis); -// TimeUnit.SECONDS.sleep(3); -// //Get test data, run the test data through the network to generate predictions, and plot those predictions: -// PlotUtil.plotTestData(model, testIter, allXYPoints, nPointsPerAxis); -// } -// } -//} -// diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphConvolutionVertex.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphConvolutionVertex.java deleted file mode 100644 index 6a16711..0000000 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphConvolutionVertex.java +++ /dev/null @@ -1,24 +0,0 @@ -package eu.dnetlib.deeplearning.layers; - -import org.deeplearning4j.nn.conf.graph.GraphVertex; -import org.deeplearning4j.nn.conf.layers.samediff.SDVertexParams; -import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLambdaVertex; -import org.deeplearning4j.nn.conf.layers.samediff.SameDiffVertex; -import org.nd4j.autodiff.samediff.SDVariable; -import org.nd4j.autodiff.samediff.SameDiff; -import org.nd4j.linalg.api.ndarray.INDArray; - -import java.util.Map; - -public class GraphConvolutionVertex extends SameDiffLambdaVertex { - - @Override - public SDVariable defineVertex(SameDiff sameDiff, VertexInputs inputs) { - SDVariable features = inputs.getInput(0); - SDVariable adjacency = inputs.getInput(1); - SDVariable degree = inputs.getInput(2).pow(0.5); - - //result: DegreeMatrix^-0.5 x Adjacent x DegreeMatrix^-0.5 x Features - return degree.mmul(adjacency).mmul(degree).mmul(features); - } -} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphGlobalAddPool.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphGlobalAddPool.java deleted file mode 100644 index 74f2f3f..0000000 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/layers/GraphGlobalAddPool.java +++ /dev/null @@ -1,21 +0,0 @@ -package eu.dnetlib.deeplearning.layers; - -import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLambdaLayer; -import org.deeplearning4j.nn.conf.layers.samediff.SameDiffLayer; -import org.nd4j.autodiff.samediff.SDIndex; -import 
org.nd4j.autodiff.samediff.SDVariable; -import org.nd4j.autodiff.samediff.SameDiff; - -import java.util.Map; - -public class GraphGlobalAddPool extends SameDiffLambdaLayer { - - int size; - public GraphGlobalAddPool(int size) { - this.size = size; - } - @Override - public SDVariable defineLayer(SameDiff sameDiff, SDVariable layerInput) { - return layerInput.mean(0).reshape(1, size); //reshape because output layer expects 2-dimensional arrays - } -} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/DataSetProcessor.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/DataSetProcessor.java deleted file mode 100644 index cfaf9d2..0000000 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/DataSetProcessor.java +++ /dev/null @@ -1,88 +0,0 @@ -package eu.dnetlib.deeplearning.support; - -import eu.dnetlib.featureextraction.Utilities; -import eu.dnetlib.support.Author; -import eu.dnetlib.support.ConnectedComponent; -import eu.dnetlib.support.Relation; -import org.apache.spark.api.java.JavaRDD; -import org.codehaus.jackson.map.ObjectMapper; -import org.jetbrains.annotations.NotNull; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.dataset.MultiDataSet; -import org.nd4j.linalg.factory.Nd4j; - -import java.io.IOException; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class DataSetProcessor { - - public static JavaRDD entityGroupToMultiDataset(JavaRDD groupEntity, String idJPath, String featureJPath, String groundTruthJPath) { - - return groupEntity.map(g -> { - Map featuresMap = new HashMap<>(); - List groundTruth = new ArrayList<>(); - Set entities = g.getDocs(); - for(String json:entities) { - featuresMap.put( - Utilities.getJPathString(idJPath, json), - Utilities.getJPathArray(featureJPath, json) - ); - groundTruth.add(Utilities.getJPathString(groundTruthJPath, json)); - } - - Set relations = g.getSimrels(); - - return getMultiDataSet(featuresMap, relations, groundTruth); - }); - } - - public static MultiDataSet getMultiDataSet(Map featuresMap, Set relations, List groundTruth) { - - List identifiers = new ArrayList<>(featuresMap.keySet()); - - int numNodes = identifiers.size(); - - //initialize arrays - INDArray adjacency = Nd4j.zeros(numNodes, numNodes); - INDArray features = Nd4j.zeros(numNodes, featuresMap.get(identifiers.get(0)).length); //feature size taken from the first element (it's equal for every element) - INDArray degree = Nd4j.zeros(numNodes, numNodes); - - //create adjacency - for(Relation r: relations) { - adjacency.put(identifiers.indexOf(r.getSource()), identifiers.indexOf(r.getTarget()), 1); - adjacency.put(identifiers.indexOf(r.getTarget()), identifiers.indexOf(r.getSource()), 1); - } - adjacency.addi(Nd4j.eye(numNodes)); - - //create degree and features - List degreeSupport = relations.stream().flatMap(r -> Stream.of(r.getSource(), r.getTarget())).collect(Collectors.toList()); - for(int i=0; i< identifiers.size(); i++) { - degree.put(i, i, Collections.frequency(degreeSupport, identifiers.get(i))); - features.putRow(i, Nd4j.create(featuresMap.get(identifiers.get(i)))); - } - - //infer label - INDArray label = Nd4j.zeros(1, 2); - if (groundTruth.stream().distinct().count()==1) { - //correct (same elements) - label.put(0, 0, 1.0); - } - else { - //wrong (different elements) - label.put(0, 1, 1.0); - } - - return new MultiDataSet( - new INDArray[]{ - features, - adjacency, - degree - }, - new INDArray[]{ - label - } - ); - } 
-} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/GroupMultiDataSet.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/GroupMultiDataSet.java deleted file mode 100644 index b6734dd..0000000 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/GroupMultiDataSet.java +++ /dev/null @@ -1,11 +0,0 @@ -package eu.dnetlib.deeplearning.support; - -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.dataset.MultiDataSet; - -import java.io.*; -import java.util.List; - -public class GroupMultiDataSet extends MultiDataSet { - -} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/NetworkConfigurations.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/NetworkConfigurations.java deleted file mode 100644 index 52db977..0000000 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/NetworkConfigurations.java +++ /dev/null @@ -1,97 +0,0 @@ -package eu.dnetlib.deeplearning.support; - -import eu.dnetlib.deeplearning.layers.GraphConvolutionVertex; -import eu.dnetlib.deeplearning.layers.GraphGlobalAddPool; -import org.bytedeco.opencv.opencv_dnn.PoolingLayer; -import org.deeplearning4j.nn.conf.ComputationGraphConfiguration; -import org.deeplearning4j.nn.conf.MultiLayerConfiguration; -import org.deeplearning4j.nn.conf.NeuralNetConfiguration; -import org.deeplearning4j.nn.conf.graph.MergeVertex; -import org.deeplearning4j.nn.conf.layers.*; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.autodiff.samediff.SDVariable; -import org.nd4j.linalg.activations.Activation; -import org.nd4j.linalg.api.buffer.DataType; -import org.nd4j.linalg.learning.config.Adam; -import org.nd4j.linalg.learning.config.Nesterovs; -import org.nd4j.linalg.lossfunctions.LossFunctions; - -public class NetworkConfigurations { - - //parameteres default values - protected static final int SEED = 12345; - protected static final double LEARNING_RATE = 1e-3; - protected static final String ADJACENCY_MATRIX = "adjacency"; - protected static final String FEATURES_MATRIX = "features"; - protected static final String DEGREE_MATRIX = "degrees"; - - public static MultiLayerConfiguration getLinearDataClassifier(int numInputs, int numHiddenNodes, int numOutputs) { - return new NeuralNetConfiguration.Builder() - .seed(SEED) - .weightInit(WeightInit.XAVIER) - .updater(new Nesterovs(LEARNING_RATE, 0.9)) - .list() - .layer(new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) - .activation(Activation.RELU) - .build()) - .layer(new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD) - .activation(Activation.SOFTMAX) - .nIn(numHiddenNodes).nOut(numOutputs).build()) - .build(); - } - - public static ComputationGraphConfiguration getSimpleGCN(int numLayers, int numInputs, int numHiddenNodes, int numClasses) { - - ComputationGraphConfiguration.GraphBuilder baseConfig = new NeuralNetConfiguration.Builder() - .seed(SEED) - .updater(new Adam(LEARNING_RATE)) - .weightInit(WeightInit.XAVIER) - .graphBuilder() - .addInputs(FEATURES_MATRIX, ADJACENCY_MATRIX, DEGREE_MATRIX) - //first convolution layer - .addVertex("layer1", - new GraphConvolutionVertex(), - FEATURES_MATRIX, ADJACENCY_MATRIX, DEGREE_MATRIX) - .layer("conv1", - new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) - .activation(Activation.RELU) - .build(), - "layer1") - .layer("batch1", - new BatchNormalization.Builder().nOut(numHiddenNodes).build(), - "conv1"); - - //ad as many 
layers as requested - for(int i=2; i<=numLayers; i++) { - baseConfig = baseConfig.addVertex("layer" + i, - new GraphConvolutionVertex(), - "batch" + (i-1), ADJACENCY_MATRIX, DEGREE_MATRIX) - .layer("conv" + i, - new DenseLayer.Builder().nIn(numHiddenNodes).nOut(numHiddenNodes) - .activation(Activation.RELU) - .build(), - "layer" + i) - .layer("batch" + i, - new BatchNormalization.Builder().nOut(numHiddenNodes).build(), - "conv" + i); - } - - baseConfig = baseConfig - .layer("pool", - new GraphGlobalAddPool(numHiddenNodes), - "batch" + numLayers) - .layer("fc1", - new DenseLayer.Builder().nIn(numHiddenNodes).nOut(numHiddenNodes) - .activation(Activation.RELU) - .weightInit(WeightInit.XAVIER) - .build(), - "pool") - .layer("out", - new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD) - .activation(Activation.SOFTMAX) - .nIn(numHiddenNodes).nOut(numClasses).build(), - "fc1"); - - return baseConfig.setOutputs("out").build(); - } -} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/PlotUtils.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/PlotUtils.java deleted file mode 100644 index 3653b1a..0000000 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/deeplearning/support/PlotUtils.java +++ /dev/null @@ -1,253 +0,0 @@ -//package eu.dnetlib.deeplearning.support; -// -//import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -//import org.jfree.chart.ChartPanel; -//import org.jfree.chart.ChartUtilities; -//import org.jfree.chart.JFreeChart; -//import org.jfree.chart.axis.AxisLocation; -//import org.jfree.chart.axis.NumberAxis; -//import org.jfree.chart.block.BlockBorder; -//import org.jfree.chart.plot.DatasetRenderingOrder; -//import org.jfree.chart.plot.XYPlot; -//import org.jfree.chart.renderer.GrayPaintScale; -//import org.jfree.chart.renderer.PaintScale; -//import org.jfree.chart.renderer.xy.XYBlockRenderer; -//import org.jfree.chart.renderer.xy.XYLineAndShapeRenderer; -//import org.jfree.chart.title.PaintScaleLegend; -//import org.jfree.data.xy.*; -//import org.jfree.ui.RectangleEdge; -//import org.jfree.ui.RectangleInsets; -//import org.nd4j.linalg.api.ndarray.INDArray; -//import org.nd4j.linalg.api.ops.impl.indexaccum.custom.ArgMax; -//import org.nd4j.linalg.dataset.DataSet; -//import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; -//import org.nd4j.linalg.factory.Nd4j; -// -//import javax.swing.*; -//import java.awt.*; -//import java.util.ArrayList; -//import java.util.List; -// -///** -// * Simple plotting methods for the MLPClassifier quickstartexamples -// * -// * @author Alex Black -// */ -//public class PlotUtils { -// -// /** -// * Plot the training data. 
Assume 2d input, classification output -// * -// * @param model Model to use to get predictions -// * @param trainIter DataSet Iterator -// * @param backgroundIn sets of x,y points in input space, plotted in the background -// * @param nDivisions Number of points (per axis, for the backgroundIn/backgroundOut arrays) -// */ -// public static void plotTrainingData(MultiLayerNetwork model, DataSetIterator trainIter, INDArray backgroundIn, int nDivisions) { -// double[] mins = backgroundIn.min(0).data().asDouble(); -// double[] maxs = backgroundIn.max(0).data().asDouble(); -// -// DataSet ds = allBatches(trainIter); -// INDArray backgroundOut = model.output(backgroundIn); -// -// XYZDataset backgroundData = createBackgroundData(backgroundIn, backgroundOut); -// JPanel panel = new ChartPanel(createChart(backgroundData, mins, maxs, nDivisions, createDataSetTrain(ds.getFeatures(), ds.getLabels()))); -// -// JFrame f = new JFrame(); -// f.add(panel); -// f.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); -// f.pack(); -// f.setTitle("Training Data"); -// -// f.setVisible(true); -// f.setLocation(0, 0); -// } -// -// /** -// * Plot the training data. Assume 2d input, classification output -// * -// * @param model Model to use to get predictions -// * @param testIter Test Iterator -// * @param backgroundIn sets of x,y points in input space, plotted in the background -// * @param nDivisions Number of points (per axis, for the backgroundIn/backgroundOut arrays) -// */ -// public static void plotTestData(MultiLayerNetwork model, DataSetIterator testIter, INDArray backgroundIn, int nDivisions) { -// -// double[] mins = backgroundIn.min(0).data().asDouble(); -// double[] maxs = backgroundIn.max(0).data().asDouble(); -// -// INDArray backgroundOut = model.output(backgroundIn); -// XYZDataset backgroundData = createBackgroundData(backgroundIn, backgroundOut); -// DataSet ds = allBatches(testIter); -// INDArray predicted = model.output(ds.getFeatures()); -// JPanel panel = new ChartPanel(createChart(backgroundData, mins, maxs, nDivisions, createDataSetTest(ds.getFeatures(), ds.getLabels(), predicted))); -// -// JFrame f = new JFrame(); -// f.add(panel); -// f.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); -// f.pack(); -// f.setTitle("Test Data"); -// -// f.setVisible(true); -// f.setLocationRelativeTo(null); -// //f.setLocation(100,100); -// -// } -// -// -// /** -// * Create data for the background data set -// */ -// private static XYZDataset createBackgroundData(INDArray backgroundIn, INDArray backgroundOut) { -// int nRows = backgroundIn.rows(); -// double[] xValues = new double[nRows]; -// double[] yValues = new double[nRows]; -// double[] zValues = new double[nRows]; -// for (int i = 0; i < nRows; i++) { -// xValues[i] = backgroundIn.getDouble(i, 0); -// yValues[i] = backgroundIn.getDouble(i, 1); -// zValues[i] = backgroundOut.getDouble(i, 0); -// -// } -// -// DefaultXYZDataset dataset = new DefaultXYZDataset(); -// dataset.addSeries("Series 1", -// new double[][]{xValues, yValues, zValues}); -// return dataset; -// } -// -// //Training data -// private static XYDataset createDataSetTrain(INDArray features, INDArray labels) { -// int nRows = features.rows(); -// -// int nClasses = 2; // Binary classification using one output call end sigmoid. 
-// -// XYSeries[] series = new XYSeries[nClasses]; -// for (int i = 0; i < series.length; i++) series[i] = new XYSeries("Class " + i); -// INDArray argMax = Nd4j.getExecutioner().exec(new ArgMax(new INDArray[]{labels},false,new int[]{1}))[0]; -// for (int i = 0; i < nRows; i++) { -// int classIdx = (int) argMax.getDouble(i); -// series[classIdx].add(features.getDouble(i, 0), features.getDouble(i, 1)); -// } -// -// XYSeriesCollection c = new XYSeriesCollection(); -// for (XYSeries s : series) c.addSeries(s); -// return c; -// } -// -// //Test data -// private static XYDataset createDataSetTest(INDArray features, INDArray labels, INDArray predicted) { -// int nRows = features.rows(); -// -// int nClasses = 2; // Binary classification using one output call end sigmoid. -// -// XYSeries[] series = new XYSeries[nClasses * nClasses]; -// int[] series_index = new int[]{0, 3, 2, 1}; //little hack to make the charts look consistent. -// for (int i = 0; i < nClasses * nClasses; i++) { -// int trueClass = i / nClasses; -// int predClass = i % nClasses; -// String label = "actual=" + trueClass + ", pred=" + predClass; -// series[series_index[i]] = new XYSeries(label); -// } -// INDArray actualIdx = labels.argMax(1); -// INDArray predictedIdx = predicted.argMax(1); -// for (int i = 0; i < nRows; i++) { -// int classIdx = actualIdx.getInt(i); -// int predIdx = predictedIdx.getInt(i); -// int idx = series_index[classIdx * nClasses + predIdx]; -// series[idx].add(features.getDouble(i, 0), features.getDouble(i, 1)); -// } -// -// XYSeriesCollection c = new XYSeriesCollection(); -// for (XYSeries s : series) c.addSeries(s); -// return c; -// } -// -// private static JFreeChart createChart(XYZDataset dataset, double[] mins, double[] maxs, int nPoints, XYDataset xyData) { -// NumberAxis xAxis = new NumberAxis("X"); -// xAxis.setRange(mins[0], maxs[0]); -// -// -// NumberAxis yAxis = new NumberAxis("Y"); -// yAxis.setRange(mins[1], maxs[1]); -// -// XYBlockRenderer renderer = new XYBlockRenderer(); -// renderer.setBlockWidth((maxs[0] - mins[0]) / (nPoints - 1)); -// renderer.setBlockHeight((maxs[1] - mins[1]) / (nPoints - 1)); -// PaintScale scale = new GrayPaintScale(0, 1.0); -// renderer.setPaintScale(scale); -// XYPlot plot = new XYPlot(dataset, xAxis, yAxis, renderer); -// plot.setBackgroundPaint(Color.lightGray); -// plot.setDomainGridlinesVisible(false); -// plot.setRangeGridlinesVisible(false); -// plot.setAxisOffset(new RectangleInsets(5, 5, 5, 5)); -// JFreeChart chart = new JFreeChart("", plot); -// chart.getXYPlot().getRenderer().setSeriesVisibleInLegend(0, false); -// -// -// NumberAxis scaleAxis = new NumberAxis("Probability (class 1)"); -// scaleAxis.setAxisLinePaint(Color.white); -// scaleAxis.setTickMarkPaint(Color.white); -// scaleAxis.setTickLabelFont(new Font("Dialog", Font.PLAIN, 7)); -// PaintScaleLegend legend = new PaintScaleLegend(new GrayPaintScale(), -// scaleAxis); -// legend.setStripOutlineVisible(false); -// legend.setSubdivisionCount(20); -// legend.setAxisLocation(AxisLocation.BOTTOM_OR_LEFT); -// legend.setAxisOffset(5.0); -// legend.setMargin(new RectangleInsets(5, 5, 5, 5)); -// legend.setFrame(new BlockBorder(Color.red)); -// legend.setPadding(new RectangleInsets(10, 10, 10, 10)); -// legend.setStripWidth(10); -// legend.setPosition(RectangleEdge.LEFT); -// chart.addSubtitle(legend); -// -// ChartUtilities.applyCurrentTheme(chart); -// -// plot.setDataset(1, xyData); -// XYLineAndShapeRenderer renderer2 = new XYLineAndShapeRenderer(); -// 
renderer2.setBaseLinesVisible(false); -// plot.setRenderer(1, renderer2); -// -// plot.setDatasetRenderingOrder(DatasetRenderingOrder.FORWARD); -// -// return chart; -// } -// -// public static INDArray generatePointsOnGraph(double xMin, double xMax, double yMin, double yMax, int nPointsPerAxis) { -// //generate all the x,y points -// double[][] evalPoints = new double[nPointsPerAxis * nPointsPerAxis][2]; -// int count = 0; -// for (int i = 0; i < nPointsPerAxis; i++) { -// for (int j = 0; j < nPointsPerAxis; j++) { -// double x = i * (xMax - xMin) / (nPointsPerAxis - 1) + xMin; -// double y = j * (yMax - yMin) / (nPointsPerAxis - 1) + yMin; -// -// evalPoints[count][0] = x; -// evalPoints[count][1] = y; -// -// count++; -// } -// } -// -// return Nd4j.create(evalPoints); -// } -// -// /** -// * This is to collect all the data and return it as one minibatch. Obviously only for use here with small datasets -// * @param iter -// * @return -// */ -// private static DataSet allBatches(DataSetIterator iter) { -// -// List fullSet = new ArrayList<>(); -// iter.reset(); -// while (iter.hasNext()) { -// List miniBatchList = iter.next().asList(); -// fullSet.addAll(miniBatchList); -// } -// iter.reset(); -// return new ListDataSetIterator<>(fullSet,fullSet.size()).next(); -// } -// -//} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/example/Example.scala b/dnet-feature-extraction/src/main/java/eu/dnetlib/example/Example.scala new file mode 100644 index 0000000..38fe366 --- /dev/null +++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/example/Example.scala @@ -0,0 +1,76 @@ +//package eu.dnetlib.example +// +//import com.intel.analytics.bigdl.dllib.NNContext +//import com.intel.analytics.bigdl.dllib.keras.Model +//import com.intel.analytics.bigdl.dllib.keras.models.Models +//import com.intel.analytics.bigdl.dllib.keras.optimizers.Adam +//import com.intel.analytics.bigdl.dllib.nn.ClassNLLCriterion +//import com.intel.analytics.bigdl.dllib.utils.Shape +//import com.intel.analytics.bigdl.dllib.keras.layers._ +//import com.intel.analytics.bigdl.numeric.NumericFloat +//import org.apache.spark.ml.feature.VectorAssembler +//import org.apache.spark._ +//import org.apache.spark.sql.{SQLContext, SparkSession} +//import org.apache.spark.sql.functions._ +//import org.apache.spark.sql.types.DoubleType +//object Example { +// +// +// def main(args: Array[String]): Unit = { +// +// val conf = new SparkConf().setMaster("local[2]").setAppName("dllib_demo") +// val sc = NNContext.initNNContext(conf) +// +//// val spark = new SQLContext(sc) //deprecated +// val spark = SparkSession +// .builder() +// .config(sc.getConf) +// .getOrCreate() +// +// val path = "/Users/miconis/Desktop/example_dataset.csv" +// val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",")).csv(path) +// .toDF("num_times_pregrant", "plasma_glucose", "blood_pressure", "skin_fold_thickness", "2-hour_insulin", "body_mass_index", "diabetes_pedigree_function", "age", "class") +// +// val assembler = new VectorAssembler() +// .setInputCols(Array("num_times_pregrant", "plasma_glucose", "blood_pressure", "skin_fold_thickness", "2-hour_insulin", "body_mass_index", "diabetes_pedigree_function", "age")) +// .setOutputCol("features") +// val assembleredDF = assembler.transform(df) +// val df2 = assembleredDF.withColumn("label", col("class").cast(DoubleType) + lit(1)) +// +// val Array(trainDF, valDF) = df2.randomSplit(Array(0.8, 0.2)) +// +// val x1 = Input(Shape(8)) +// val merge = Merge.merge(inputs = List(x1, x1), mode = 
"dot") +// val dense1 = Dense(12, activation="relu").inputs(x1) +// val dense2 = Dense(8, activation="relu").inputs(dense1) +// val dense3 = Dense(2, activation="relu").inputs(dense2) +// val dmodel = Model(x1, dense3) +// +// dmodel.compile(optimizer = new Adam(), loss = ClassNLLCriterion()) +// +// +// //training +// dmodel.fit(x = trainDF, batchSize = 4, nbEpoch = 2, featureCols = Array("features"), labelCols = Array("label"), valX = valDF) +// +// +//// //save model +//// val modelPath = "/tmp/demo/keras.model" +//// dmodel.saveModel(modelPath) +//// +//// +//// //load model +//// val loadModel = Models.loadModel(modelPath) +// val loadModel = dmodel +// +// //inference +// val preDF2 = loadModel.predict(valDF, featureCols = Array("features"), predictionCol = "predict") +// +// preDF2.show(false) +// +// //evaluation +// val ret = dmodel.evaluate(trainDF, batchSize = 4, featureCols = Array("features"), labelCols = Array("label")) +// +// ret.foreach(println) +// +// } +//} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/FeatureTransformer.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/FeatureTransformer.java index e55a99a..1f5c8a2 100644 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/FeatureTransformer.java +++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/FeatureTransformer.java @@ -1,5 +1,11 @@ package eu.dnetlib.featureextraction; +import com.google.common.collect.Lists; +import com.johnsnowlabs.nlp.*; +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector; +import com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings; +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.clustering.LDA; import org.apache.spark.ml.clustering.LDAModel; import org.apache.spark.ml.feature.CountVectorizer; @@ -8,16 +14,20 @@ import org.apache.spark.ml.feature.StopWordsRemover; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; import scala.Tuple2; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.net.URISyntaxException; +import java.nio.file.Paths; +import java.util.*; public class FeatureTransformer implements Serializable { @@ -161,4 +171,5 @@ public class FeatureTransformer implements Serializable { public static Dataset ldaInference(Dataset inputDS, LDAModel ldaModel) { return ldaModel.transform(inputDS).select(ID_COL, LDA_INFERENCE_OUTPUT_COL); } + } diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/ScalaFeatureTransformer.scala b/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/ScalaFeatureTransformer.scala new file mode 100644 index 0000000..1bcf09b --- /dev/null +++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/ScalaFeatureTransformer.scala @@ -0,0 +1,157 @@ +package eu.dnetlib.featureextraction + +import com.johnsnowlabs.nlp.EmbeddingsFinisher +import com.johnsnowlabs.nlp.annotator.SentenceDetector +import com.johnsnowlabs.nlp.annotators.Tokenizer +import 
com.johnsnowlabs.nlp.base.DocumentAssembler
+import com.johnsnowlabs.nlp.embeddings.{BertEmbeddings, BertSentenceEmbeddings, WordEmbeddingsModel}
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.sql.functions.{array, col, explode}
+import org.apache.spark.sql.{Dataset, Row}
+
+import java.nio.file.Paths
+
+object ScalaFeatureTransformer {
+
+  val DOCUMENT_COL = "document"
+  val SENTENCE_COL = "sentence"
+  val BERT_SENTENCE_EMBEDDINGS_COL = "bert_sentence"
+  val BERT_EMBEDDINGS_COL = "bert"
+  val TOKENIZER_COL = "tokens"
+  val WORD_EMBEDDINGS_COL = "word"
+
+  //models path
+  private val bertSentenceModelPath = Paths.get(getClass.getResource("/eu/dnetlib/featureextraction/support/sent_small_bert_L6_512_en_2.6.0_2.4_1598350624049").toURI).toFile.getAbsolutePath
+  private val bertModelPath = Paths.get(getClass.getResource("/eu/dnetlib/featureextraction/support/small_bert_L2_128_en_2.6.0_2.4_1598344320681").toURI).toFile.getAbsolutePath
+  private val wordModelPath = Paths.get(getClass.getResource("/eu/dnetlib/featureextraction/support/glove_100d_en_2.4.0_2.4_1579690104032").toURI).toFile.getAbsolutePath
+
+  /**
+    * Extract the SentenceBERT embeddings for the given field.
+    *
+    * @param inputData: the input data
+    * @param inputField: the input field
+    * @param modelPath: the path of the pre-trained model to load
+    * @return the dataset with the embeddings
+    * */
+  def bertSentenceEmbeddings(inputData: Dataset[Row], inputField: String, modelPath: String): Dataset[Row] = {
+
+    val documentAssembler = new DocumentAssembler()
+      .setInputCol(inputField)
+      .setOutputCol(DOCUMENT_COL)
+
+    val sentence = new SentenceDetector()
+      .setInputCols(DOCUMENT_COL)
+      .setOutputCol(SENTENCE_COL)
+
+    val bertSentenceEmbeddings = BertSentenceEmbeddings
+      .load(modelPath)
+      .setInputCols(SENTENCE_COL)
+      .setOutputCol("raw_" + BERT_SENTENCE_EMBEDDINGS_COL)
+      .setCaseSensitive(false)
+
+    val bertSentenceEmbeddingsFinisher = new EmbeddingsFinisher()
+      .setInputCols("raw_" + BERT_SENTENCE_EMBEDDINGS_COL)
+      .setOutputCols(BERT_SENTENCE_EMBEDDINGS_COL)
+      .setOutputAsVector(true)
+      .setCleanAnnotations(false)
+
+    val pipeline = new Pipeline()
+      .setStages(Array(
+        documentAssembler,
+        sentence,
+        bertSentenceEmbeddings,
+        bertSentenceEmbeddingsFinisher
+      ))
+
+    val result = pipeline.fit(inputData).transform(inputData).withColumn(BERT_SENTENCE_EMBEDDINGS_COL, explode(col(BERT_SENTENCE_EMBEDDINGS_COL)))
+
+    result
+  }
+
+  /**
+    * Extract the BERT embeddings for the given field.
+ * + * @param inputData : the input data + * @param inputField : the input field + * @return the dataset with the embeddings + * */ + def bertEmbeddings(inputData: Dataset[Row], inputField: String, modelPath: String): Dataset[Row] = { + + val documentAssembler = new DocumentAssembler() + .setInputCol(inputField) + .setOutputCol(DOCUMENT_COL) + + val tokenizer = new Tokenizer() + .setInputCols(DOCUMENT_COL) + .setOutputCol(TOKENIZER_COL) + + val bertEmbeddings = BertEmbeddings + .load(modelPath) + .setInputCols(TOKENIZER_COL, DOCUMENT_COL) + .setOutputCol("raw_" + BERT_EMBEDDINGS_COL) + .setCaseSensitive(false) + + val bertEmbeddingsFinisher = new EmbeddingsFinisher() + .setInputCols("raw_" + BERT_EMBEDDINGS_COL) + .setOutputCols(BERT_EMBEDDINGS_COL) + .setOutputAsVector(true) + .setCleanAnnotations(false) + + val pipeline = new Pipeline() + .setStages(Array( + documentAssembler, + tokenizer, + bertEmbeddings, + bertEmbeddingsFinisher + )) + + val result = pipeline.fit(inputData).transform(inputData).withColumn(BERT_EMBEDDINGS_COL, explode(col(BERT_EMBEDDINGS_COL))) + + result + } + + /** + * Extract the Word2Vec embeddings for the given field. + * + * @param inputData : the input data + * @param inputField : the input field + * @return the dataset with the embeddings + * */ + def wordEmbeddings(inputData: Dataset[Row], inputField: String, modelPath: String): Dataset[Row] = { + + val documentAssembler = new DocumentAssembler() + .setInputCol(inputField) + .setOutputCol(DOCUMENT_COL) + + val tokenizer = new Tokenizer() + .setInputCols(DOCUMENT_COL) + .setOutputCol(TOKENIZER_COL) + + val wordEmbeddings = WordEmbeddingsModel + .load(modelPath) + .setInputCols(DOCUMENT_COL, TOKENIZER_COL) + .setOutputCol("raw_" + WORD_EMBEDDINGS_COL) + + val wordEmbeddingsFinisher = new EmbeddingsFinisher() + .setInputCols("raw_" + WORD_EMBEDDINGS_COL) + .setOutputCols(WORD_EMBEDDINGS_COL) + .setOutputAsVector(true) + .setCleanAnnotations(false) + + val pipeline = new Pipeline() + .setStages(Array( + documentAssembler, + tokenizer, + wordEmbeddings, + wordEmbeddingsFinisher + )) + + val result = pipeline.fit(inputData).transform(inputData).withColumn(WORD_EMBEDDINGS_COL, explode(col(WORD_EMBEDDINGS_COL))) + + result + } + + //bert on the title + //bert sentence: on the abstract + //word2vec: on the subjects + +} diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java index 9d65a7d..aee3b1e 100644 --- a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java +++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java @@ -5,6 +5,7 @@ import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.util.List; +import java.util.Map; public class Author implements Serializable { @@ -12,24 +13,32 @@ public class Author implements Serializable { public String firstname; public String lastname; public List coAuthors; - public double[] topics; public String orcid; public String id; + public Map embeddings; - public String pubId; - - public Author() { + public Map getEmbeddings() { + return embeddings; } - public Author(String fullname, String firstname, String lastname, List coAuthors, double[] topics, String id, String pubId, String orcid) { + public Author(String fullname, String firstname, String lastname, List coAuthors, String orcid, String id, Map embeddings, String pubId) { this.fullname = fullname; this.firstname = firstname; this.lastname = lastname; 
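A minimal sketch of how the new transformer is invoked; the toy DataFrame and the local model path are assumptions, while the column constants come from the object above:

```scala
import org.apache.spark.sql.SparkSession
import eu.dnetlib.featureextraction.ScalaFeatureTransformer

val spark = SparkSession.builder()
  .appName("embeddings-example")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// two toy publications with a title field
val input = Seq(
  "graph neural networks for deduplication",
  "author name disambiguation with embeddings"
).toDF("title")

// path to a locally downloaded Spark NLP sentence BERT model (assumption)
val modelPath = "/tmp/models/sent_small_bert_L6_512_en"

val withEmbeddings = ScalaFeatureTransformer.bertSentenceEmbeddings(input, "title", modelPath)
withEmbeddings.select("title", ScalaFeatureTransformer.BERT_SENTENCE_EMBEDDINGS_COL).show(false)
```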
diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java
index 9d65a7d..aee3b1e 100644
--- a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java
+++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Author.java
@@ -5,6 +5,7 @@ import org.codehaus.jackson.annotate.JsonIgnore;
 
 import java.io.Serializable;
 import java.util.List;
+import java.util.Map;
 
 public class Author implements Serializable {
 
@@ -12,24 +13,32 @@ public class Author implements Serializable {
     public String firstname;
     public String lastname;
     public List<CoAuthor> coAuthors;
-    public double[] topics;
     public String orcid;
     public String id;
+    public Map<String, double[]> embeddings;
+    public String pubId;
 
     public Author() {
     }
 
-    public Author(String fullname, String firstname, String lastname, List<CoAuthor> coAuthors, double[] topics, String id, String pubId, String orcid) {
+    public Author(String fullname, String firstname, String lastname, List<CoAuthor> coAuthors, String orcid, String id, Map<String, double[]> embeddings, String pubId) {
         this.fullname = fullname;
         this.firstname = firstname;
         this.lastname = lastname;
         this.coAuthors = coAuthors;
-        this.topics = topics;
-        this.id = id;
-        this.pubId = pubId;
         this.orcid = orcid;
+        this.id = id;
+        this.embeddings = embeddings;
+        this.pubId = pubId;
+    }
+
+    public Map<String, double[]> getEmbeddings() {
+        return embeddings;
+    }
+
+    public void setEmbeddings(Map<String, double[]> embeddings) {
+        this.embeddings = embeddings;
     }
 
     public String getFullname() {
@@ -64,14 +73,6 @@ public class Author implements Serializable {
         this.coAuthors = coAuthors;
     }
 
-    public double[] getTopics() {
-        return topics;
-    }
-
-    public void setTopics(double[] topics) {
-        this.topics = topics;
-    }
-
     public String getId() {
         return id;
     }
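Since the constructor signature changed (the `topics` vector is gone, and `embeddings` plus `pubId` moved to the tail), a quick sketch of the new call shape; the generic types follow the reconstruction above and the ORCID/ids are placeholders:

```scala
import java.util.{ArrayList, HashMap}
import eu.dnetlib.support.{Author, CoAuthor}

// new parameter order: fullname, firstname, lastname, coAuthors, orcid, id, embeddings, pubId
val embeddings = new HashMap[String, Array[Double]]()
embeddings.put("bert_sentence", Array(0.12, -0.34, 0.56)) // toy vector

val author = new Author(
  "De Bonis, Michele",
  "Michele",
  "De Bonis",
  new ArrayList[CoAuthor](),
  "0000-0000-0000-0000", // placeholder ORCID
  "author::0000",        // placeholder raw id
  embeddings,
  "pub::id")
```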
diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/AuthorsFactory.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/AuthorsFactory.java
index 78e6e66..6540f4e 100644
--- a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/AuthorsFactory.java
+++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/AuthorsFactory.java
@@ -14,34 +14,32 @@ import javax.rmi.CORBA.Util;
 import java.math.BigInteger;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
 import java.util.stream.Collectors;
 
 public class AuthorsFactory {
 
-    public static JavaRDD<Author> extractAuthorsFromPublications(JavaRDD<String> entities, JavaPairRDD<String, DenseVector> topics) {
+    public static JavaRDD<Author> extractAuthorsFromPublications(JavaRDD<Publication> publications, JavaPairRDD<String, Map<String, double[]>> embeddings) {
 
-        JavaPairRDD<Publication, DenseVector> publicationWithTopics = entities.map(x -> new ObjectMapper().configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false).readValue(x, Publication.class))
+        //join every publication with its embeddings
+        JavaPairRDD<Publication, Map<String, double[]>> publicationWithEmbeddings = publications
                 .mapToPair(p -> new Tuple2<>(p.getId(), p))
-                .join(topics)
+                .join(embeddings)
                 .mapToPair(Tuple2::_2);
 
-        return publicationWithTopics.flatMap(p -> createAuthors(p));
+        return publicationWithEmbeddings.flatMap(AuthorsFactory::createAuthors);
     }
 
-    public static Iterator<Author> createAuthors(Tuple2<Publication, DenseVector> publicationWithTopic){
-        List<CoAuthor> baseCoAuthors = publicationWithTopic._1()
+    public static Iterator<Author> createAuthors(Tuple2<Publication, Map<String, double[]>> publicationWithEmbeddings){
+        List<CoAuthor> baseCoAuthors = publicationWithEmbeddings._1()
                 .getAuthor()
                 .stream()
                 .map(a -> new CoAuthor(a.getFullname(), a.getName()!=null?a.getName():"", a.getSurname()!=null?a.getSurname():"", a.getPid().size()>0? a.getPid().get(0).getValue():""))
                 .collect(Collectors.toList());
 
         List<Author> authors = new ArrayList<>();
-        for(eu.dnetlib.dhp.schema.oaf.Author a : publicationWithTopic._1().getAuthor()) {
+        for(eu.dnetlib.dhp.schema.oaf.Author a : publicationWithEmbeddings._1().getAuthor()) {
 
             //prepare orcid
             String orcid = a.getPid().size()>0? a.getPid().get(0).getValue() : "";
@@ -50,9 +48,19 @@ public class AuthorsFactory {
             coAuthors.remove(new CoAuthor(a.getFullname(), a.getName() != null ? a.getName() : "", a.getSurname() != null ? a.getSurname() : "", a.getPid().size() > 0 ? a.getPid().get(0).getValue() : ""));
 
             //prepare raw author id
-            String id = "author::" + getMd5(a.getFullname().concat(publicationWithTopic._1().getId()));
+            String id = "author::" + getMd5(a.getFullname().concat(publicationWithEmbeddings._1().getId()));
 
-            authors.add(new Author(a.getFullname(), a.getName(), a.getSurname(), coAuthors, publicationWithTopic._2().toArray(), id, publicationWithTopic._1().getId(), orcid));
+            //create the author, carrying over the publication embeddings
+            authors.add(new Author(
+                    a.getFullname(),
+                    a.getName(),
+                    a.getSurname(),
+                    coAuthors,
+                    orcid,
+                    id,
+                    publicationWithEmbeddings._2(),
+                    publicationWithEmbeddings._1().getId())
+            );
         }
 
         return authors.iterator();
diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/ConnectedComponent.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/ConnectedComponent.java
new file mode 100644
index 0000000..4276be6
--- /dev/null
+++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/ConnectedComponent.java
@@ -0,0 +1,70 @@
+package eu.dnetlib.support;
+
+import com.google.common.collect.Sets;
+import org.codehaus.jackson.map.ObjectMapper;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.Set;
+
+public class ConnectedComponent implements Serializable {
+
+    private HashSet<String> docs;
+    private String ccId;
+    private HashSet<Relation> simrels;
+
+    public ConnectedComponent() {
+    }
+
+    public ConnectedComponent(String ccId, Set<String> docs, Set<Relation> simrels) {
+        this.docs = new HashSet<>(docs);
+        this.ccId = ccId;
+        this.simrels = new HashSet<>(simrels);
+    }
+
+    public ConnectedComponent(Set<String> docs) {
+        this.docs = new HashSet<>(docs);
+        //ccId and simrels are deliberately left unset here: callers must assign them afterwards
+    }
+
+    public ConnectedComponent(String ccId, Iterable<String> docs, Iterable<Relation> simrels) {
+        this.ccId = ccId;
+        this.docs = Sets.newHashSet(docs);
+        this.simrels = Sets.newHashSet(simrels);
+    }
+
+    @Override
+    public String toString() {
+        ObjectMapper mapper = new ObjectMapper();
+        try {
+            return mapper.writeValueAsString(this);
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to create Json: ", e);
+        }
+    }
+
+    public Set<String> getDocs() {
+        return docs;
+    }
+
+    public void setDocs(HashSet<String> docs) {
+        this.docs = docs;
+    }
+
+    public String getCcId() {
+        return ccId;
+    }
+
+    public void setCcId(String ccId) {
+        this.ccId = ccId;
+    }
+
+    public HashSet<Relation> getSimrels() {
+        return simrels;
+    }
+
+    public void setSimrels(HashSet<Relation> simrels) {
+        this.simrels = simrels;
+    }
+}
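A short sketch of how a component might be assembled and serialized; the ids are illustrative and `Relation` is the support class introduced just below:

```scala
import com.google.common.collect.Sets
import eu.dnetlib.support.{ConnectedComponent, Relation}

val docs = Sets.newHashSet("author::a1", "author::b2")
val simrels = Sets.newHashSet(new Relation("author::a1", "author::b2", "simrel"))

val cc = new ConnectedComponent("cc::0", docs, simrels)
println(cc) // toString() serializes the component to JSON through Jackson
```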
diff --git a/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Relation.java b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Relation.java
new file mode 100644
index 0000000..f741e9d
--- /dev/null
+++ b/dnet-feature-extraction/src/main/java/eu/dnetlib/support/Relation.java
@@ -0,0 +1,52 @@
+package eu.dnetlib.support;
+
+import java.io.Serializable;
+
+public class Relation implements Serializable {
+
+    String source;
+    String target;
+    String type;
+
+    public Relation() {
+    }
+
+    public Relation(String source, String target, String type) {
+        this.source = source;
+        this.target = target;
+        this.type = type;
+    }
+
+    public String getSource() {
+        return source;
+    }
+
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    public String getTarget() {
+        return target;
+    }
+
+    public void setTarget(String target) {
+        this.target = target;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    @Override
+    public String toString() {
+        return "Relation{" +
+                "source='" + source + '\'' +
+                ", target='" + target + '\'' +
+                ", type='" + type + '\'' +
+                '}';
+    }
+}
diff --git a/dnet-feature-extraction/src/test/java/UtilityTest.java b/dnet-feature-extraction/src/test/java/UtilityTest.java
index 386a41b..6b59e29 100644
--- a/dnet-feature-extraction/src/test/java/UtilityTest.java
+++ b/dnet-feature-extraction/src/test/java/UtilityTest.java
@@ -7,6 +7,7 @@ import org.junit.jupiter.api.Test;
 
 import java.lang.annotation.Target;
 import java.util.ArrayList;
+import java.util.HashMap;
 
 public class UtilityTest {
 
@@ -24,7 +25,7 @@ public class UtilityTest {
 
     @Test
     public void lnfiTest() throws Exception {
-        Author a = new Author("De Bonis, Michele", "Æ", "De Bonis", new ArrayList<>(), new double[]{0.0, 1.0}, "author::id", "pub::id", "orcid");
+        Author a = new Author("De Bonis, Michele", "Æ", "De Bonis", new ArrayList<>(), "orcid", "author::id", new HashMap<>(), "pub::id");
         System.out.println("a = " + a.isAccurate());
         System.out.println(AuthorsFactory.getLNFI(a));
     }
diff --git a/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/DataSetProcessorTest.java b/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/DataSetProcessorTest.java
deleted file mode 100644
index 0b0f2c7..0000000
--- a/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/DataSetProcessorTest.java
+++ /dev/null
@@ -1,47 +0,0 @@
-package eu.dnetlib.deeplearning;
-
-import com.beust.jcommander.internal.Sets;
-import com.google.common.collect.Lists;
-import eu.dnetlib.deeplearning.support.DataSetProcessor;
-import eu.dnetlib.support.Relation;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-import org.nd4j.linalg.dataset.MultiDataSet;
-
-import java.util.*;
-import java.util.stream.Collectors;
-
-public class DataSetProcessorTest {
-
-    static Map<String, double[]> features;
-    static Set<Relation> relations;
-    static List<String> groundTruth;
-
-    @BeforeAll
-    public static void init(){
-        //initialize example features
-        features = new HashMap<>();
-        features.put("0", new double[]{0.0,0.0});
-        features.put("1", new double[]{1.0,1.0});
-        features.put("2", new double[]{2.0,2.0});
-
-        //initialize example relations
-        relations = new HashSet<>(Lists.newArrayList(
-                new Relation("0", "1", "simrel"),
-                new Relation("1", "2", "simrel")
-        ));
-
-        //initialize example ground truth
-        groundTruth = Lists.newArrayList("class1", "class1", "class2");
-
-    }
-
-    @Test
-    public void getMultiDataSetTest() throws Exception {
-        MultiDataSet multiDataSet = DataSetProcessor.getMultiDataSet(features, relations, groundTruth);
-        System.out.println("multiDataSet = " + multiDataSet);
-
-        multiDataSet.asList();
-    }
-
-}
diff --git a/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/NetworkConfigurationTests.java b/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/NetworkConfigurationTests.java
deleted file mode 100644
index ad791cb..0000000
--- a/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/NetworkConfigurationTests.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package eu.dnetlib.deeplearning;
-
-import eu.dnetlib.deeplearning.support.NetworkConfigurations;
-import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
-import org.deeplearning4j.nn.graph.ComputationGraph;
-import org.junit.jupiter.api.Test;
-import org.nd4j.linalg.api.ndarray.INDArray;
-import org.nd4j.linalg.factory.Nd4j;
-
-public class NetworkConfigurationTests {
-
-    public final static int N = 3; //number of nodes
-    public final static int K = 7; //number of features
-
-    public static INDArray[] exampleGraph = new INDArray[]{
-            Nd4j.zeros(N, K), //features
-            Nd4j.ones(N, N),  //adjacency
-            Nd4j.ones(N, N)   //degree
-    };
-
-    @Test
-    public void simpleGCNTest() {
-
-        ComputationGraphConfiguration simpleGCNConf = NetworkConfigurations.getSimpleGCN(3, K, 5, 2);
-        ComputationGraph simpleGCN = new ComputationGraph(simpleGCNConf);
-        simpleGCN.init();
-
-        INDArray[] output = simpleGCN.output(exampleGraph);
-        System.out.println("output = " + output[0]);
-
-    }
-
-}
diff --git a/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/featureextraction/FeatureTransformerTest.java b/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/featureextraction/FeatureTransformerTest.java
new file mode 100644
index 0000000..6a1534e
--- /dev/null
+++ b/dnet-feature-extraction/src/test/java/eu/dnetlib/deeplearning/featureextraction/FeatureTransformerTest.java
@@ -0,0 +1,53 @@
+package eu.dnetlib.deeplearning.featureextraction;
+
+import eu.dnetlib.featureextraction.ScalaFeatureTransformer;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+public class FeatureTransformerTest {
+
+    static SparkSession spark;
+    static JavaSparkContext context;
+    static Dataset<Row> inputData;
+    static StructType inputSchema = new StructType(new StructField[]{
+            new StructField("title", DataTypes.StringType, false, Metadata.empty()),
+            new StructField("abstract", DataTypes.StringType, false, Metadata.empty())
+    });
+
+    @BeforeAll
+    public static void setup() throws IOException {
+
+        spark = SparkSession
+                .builder()
+                .appName("Testing")
+                .master("local[*]")
+                .getOrCreate();
+
+        context = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        inputData = spark.createDataFrame(Arrays.asList(
+                RowFactory.create("article title 1", "article description 1"),
+                RowFactory.create("article title 2", "article description 2")
+        ), inputSchema);
+    }
+
+}
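The new test class only provisions the Spark session so far; a first smoke test could follow this sketch (shown in Scala to match the transformer; the local model path is an assumption):

```scala
import eu.dnetlib.featureextraction.ScalaFeatureTransformer

// hypothetical smoke test: each title gets a sentence-embedding column, one exploded row per sentence
val result = ScalaFeatureTransformer.bertSentenceEmbeddings(
  inputData, "title", "/tmp/models/sent_small_bert_L6_512_en") // path is an assumption

assert(result.columns.contains(ScalaFeatureTransformer.BERT_SENTENCE_EMBEDDINGS_COL))
assert(result.count() == inputData.count()) // single-sentence titles: explode preserves the row count
```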
diff --git a/pom.xml b/pom.xml
index 82b42eb..61f468f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -94,6 +94,8 @@ false + + target
@@ -305,31 +307,31 @@
 		<dependency>
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 			<version>${jackson.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>com.fasterxml.jackson.dataformat</groupId>
 			<artifactId>jackson-dataformat-xml</artifactId>
 			<version>${jackson.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>com.fasterxml.jackson.module</groupId>
 			<artifactId>jackson-module-jsonSchema</artifactId>
 			<version>${jackson.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-core</artifactId>
 			<version>${jackson.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-annotations</artifactId>
 			<version>${jackson.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
@@ -388,25 +390,25 @@
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-core_2.11</artifactId>
 			<version>${spark.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-graphx_2.11</artifactId>
 			<version>${spark.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-sql_2.11</artifactId>
 			<version>${spark.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-mllib_2.11</artifactId>
 			<version>${spark.version}</version>
-			<scope>provided</scope>
+			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.junit.jupiter</groupId>
@@ -451,92 +453,79 @@
-		<dependency>
-			<groupId>org.nd4j</groupId>
-			<artifactId>${nd4j.backend}</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.datavec</groupId>
-			<artifactId>datavec-api</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.datavec</groupId>
-			<artifactId>datavec-data-image</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.datavec</groupId>
-			<artifactId>datavec-local</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>deeplearning4j-datasets</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>deeplearning4j-core</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
+		<dependency>
+			<groupId>com.johnsnowlabs.nlp</groupId>
+			<artifactId>spark-nlp_${scala.binary.version}</artifactId>
+			<version>2.7.5</version>
+		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>resources</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>deeplearning4j-ui</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>deeplearning4j-zoo</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>dl4j-spark-parameterserver_2.11</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>dl4j-spark_2.11</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>jfree</groupId>
-			<artifactId>jfreechart</artifactId>
-			<version>1.0.13</version>
-		</dependency>
-		<dependency>
-			<groupId>org.jfree</groupId>
-			<artifactId>jcommon</artifactId>
-			<version>1.0.23</version>
-		</dependency>
-		<dependency>
-			<groupId>org.deeplearning4j</groupId>
-			<artifactId>deeplearning4j-datasets</artifactId>
-			<version>${dl4j-master.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>eu.dnetlib</groupId>
-			<artifactId>dnet-dedup-test</artifactId>
-			<version>4.1.13-SNAPSHOT</version>
-		</dependency>
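With the dl4j/nd4j stack removed and spark-nlp added, the field-to-model mapping noted at the bottom of ScalaFeatureTransformer (BERT on titles, sentence BERT on abstracts, word embeddings on subjects) would be wired roughly as in this sketch; the model paths and the layout of `publications` (a `Dataset[Row]` with "title", "abstract" and "subjects" string columns) are assumptions:

```scala
import eu.dnetlib.featureextraction.ScalaFeatureTransformer

// assumed local copies of the Spark NLP models (mirroring the resource paths in ScalaFeatureTransformer)
val bertPath     = "/tmp/models/small_bert_L2_128_en"
val sentBertPath = "/tmp/models/sent_small_bert_L6_512_en"
val glovePath    = "/tmp/models/glove_100d_en"

// each extractor is applied to the base dataset separately (chaining would clash on the "document" column)
val titleVecs    = ScalaFeatureTransformer.bertEmbeddings(publications, "title", bertPath)
val abstractVecs = ScalaFeatureTransformer.bertSentenceEmbeddings(publications, "abstract", sentBertPath)
val subjectVecs  = ScalaFeatureTransformer.wordEmbeddings(publications, "subjects", glovePath)
// the three outputs can then be joined on the publication id to build the per-publication embeddings map
```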