package eu.dnetlib.jobs;

import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

public class SparkComputeStatistics extends AbstractSparkJob {

    private static final Logger log = LoggerFactory.getLogger(SparkComputeStatistics.class);

    public SparkComputeStatistics(ArgumentApplicationParser parser, SparkSession spark) {
        super(parser, spark);
    }

    public static void main(String[] args) throws Exception {

        ArgumentApplicationParser parser = new ArgumentApplicationParser(
                readResource("/jobs/parameters/computeStatistics_parameters.json", SparkComputeStatistics.class)
        );

        parser.parseArgument(args);

        SparkConf conf = new SparkConf();

        new SparkComputeStatistics(
                parser,
                getSparkSession(conf)
        ).run();
    }

    @Override
    public void run() throws IOException {

        // for an overview of clustering evaluation metrics, see:
        // https://towardsdatascience.com/7-evaluation-metrics-for-clustering-algorithms-bdc537ff54d2#:~:text=There%20are%20two%20types%20of,to%20all%20unsupervised%20learning%20results)
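        // In the terms of that overview, this job computes external evaluation metrics:
        // it uses a ground truth field as class labels and scores both the blocks and
        // the final groups with the Rand index implemented below.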
        // read oozie parameters
        final String entitiesPath = parser.get("entitiesPath");
        final String workingPath = parser.get("workingPath");
        final String dedupConfPath = parser.get("dedupConfPath");
        final String groundTruthFieldJPath = parser.get("groundTruthFieldJPath");
        final int numPartitions = Optional
                .ofNullable(parser.get("numPartitions"))
                .map(Integer::valueOf)
                .orElse(NUM_PARTITIONS);

        log.info("entitiesPath: '{}'", entitiesPath);
        log.info("workingPath: '{}'", workingPath);
        log.info("numPartitions: '{}'", numPartitions);
        log.info("dedupConfPath: '{}'", dedupConfPath);
        log.info("groundTruthFieldJPath: '{}'", groundTruthFieldJPath);

        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        DedupConfig dedupConfig = loadDedupConfig(dedupConfPath);

        JavaPairRDD<String, MapDocument> mapDocuments = sc
                .textFile(entitiesPath)
                .repartition(numPartitions)
                .mapToPair(
                        (PairFunction<String, String, MapDocument>) s -> {
                            MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, s);
                            // put the groundTruth field used to compute statistics into the field map
                            d.getFieldMap().put("groundTruth", new FieldValueImpl(Type.String, "groundTruth", MapDocumentUtil.getJPathString(groundTruthFieldJPath, s)));
                            return new Tuple2<>(d.getIdentifier(), d);
                        });

        JavaRDD<String> entities = mapDocuments.map(d -> d._2().getFieldMap().get("groundTruth").stringValue());

        // create blocks and keep, for each block, only the ground truth labels of its documents
        JavaRDD<List<String>> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig)
                .map(b -> b._2().getDocuments().stream().map(d -> d.getFieldMap().get("groundTruth").stringValue()).collect(Collectors.toList()));

        // <source, target>: source is the dedup_id, target is the id of the merged record
        JavaRDD<Relation> mergerels = spark
                .read()
                .load(workingPath + "/mergerels")
                .as(Encoders.bean(Relation.class))
                .toJavaRDD();

        JavaRDD<Relation> simrels = spark
                .read()
                .load(workingPath + "/simrels")
                .as(Encoders.bean(Relation.class))
                .toJavaRDD();

        // load the computed groups and map each one to the ground truth labels of its members
        JavaRDD<List<String>> groups = sc.textFile(workingPath + "/groupentities")
                .map(e -> new ObjectMapper().readValue(e, ConnectedComponent.class))
                .map(e -> e.getDocs().stream().map(d -> MapDocumentUtil.getJPathString(groundTruthFieldJPath, d)).collect(Collectors.toList()));

        long entities_number = entities.count();
        long blocks_number = blocks.count();
        double blocks_randIndex = randIndex(blocks);

        long simrels_number = simrels.count();
        long mergerels_number = mergerels.count();

        double groups_randIndex = randIndex(groups);
        long groups_number = groups.count();
        long groundtruth_number = entities.filter(e -> !e.isEmpty()).count();

        // a group is correct when all of its members share the same ground truth label
        long correct_groups = groups.filter(x -> x.stream().distinct().count() == 1).count();
        long wrong_groups = groups_number - correct_groups;

        String print =
                "Entities: " + entities_number + "\n" +
                "Ground Truth: " + groundtruth_number + "\n" +
                "Blocks: " + blocks_number + "\n" +
                "Blocks RI: " + blocks_randIndex + "\n" +
                "SimRels: " + simrels_number + "\n" +
                "MergeRels: " + mergerels_number + "\n" +
                "Groups: " + groups_number + " (correct: " + correct_groups + ", wrong: " + wrong_groups + ")\n" +
                "Groups RI: " + groups_randIndex;

        System.out.println(print);

        writeStatsFileToHDFS(groundtruth_number, entities_number, blocks_randIndex, groups_randIndex, blocks_number, simrels_number, mergerels_number, groups_number, workingPath + "/stats_file.txt");
    }

    public static void writeStatsFileToHDFS(long groundtruth_number, long entities_number, double blocks_randIndex, double groups_randIndex, long blocks_number, long simrels_number, long mergerels_number, long groups_number, String filePath) throws IOException {

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // remove the output file if it already exists, so that the job can be re-run
        fs.delete(new Path(filePath), true);

        try {
            Path outFile = new Path(filePath);

            // verification
            if (fs.exists(outFile)) {
                System.out.println("Output file already exists");
                throw new IOException("Output file already exists");
            }

            String print =
                    "Entities: " + entities_number + "\n" +
                    "Ground Truth: " + groundtruth_number + "\n" +
                    "Blocks: " + blocks_number + "\n" +
                    "Blocks RI: " + blocks_randIndex + "\n" +
                    "SimRels: " + simrels_number + "\n" +
                    "MergeRels: " + mergerels_number + "\n" +
                    "Groups: " + groups_number + "\n" +
                    "Groups RI: " + groups_randIndex;

            // create the file and write the statistics
            FSDataOutputStream out = fs.create(outFile);
            try {
                out.writeBytes(print);
            } finally {
                out.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // TODO: find another measure that also takes into account the elements outside of the group
    // RandIndex = number of pairwise correct predictions / total number of possible pairs (in the same cluster) -> bounded between 0 and 1
    public double randIndex(JavaRDD<List<String>> clusters) {

        Tuple2<Integer, Integer> reduce = clusters.map(c -> {
            // numerator: pairs of elements in the cluster sharing the same (non-empty) ground truth label
            int num = 0;
            for (String id : c.stream().distinct().filter(s -> !s.isEmpty()).collect(Collectors.toList())) {
                int n = (int) c.stream().filter(i -> i.equals(id)).count();
                num += binomialCoefficient(n);
            }
            // denominator: all possible pairs in the cluster
            int den = binomialCoefficient(c.size());
            return new Tuple2<>(num, den);
        })
                .reduce((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));

        return (double) reduce._1() / reduce._2();
    }

    // number of unordered pairs among n elements, i.e. the binomial coefficient C(n, 2)
    private static int binomialCoefficient(int n) {
        return n * (n - 1) / 2;
    }
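
    // Worked example (illustrative): a cluster whose members carry the ground truth labels
    // ["a", "a", "b"] contributes num = binomialCoefficient(2) = 1 (the single correct
    // "a"-"a" pair) and den = binomialCoefficient(3) = 3 (all possible pairs), i.e. 1/3;
    // the final index is the ratio of the summed numerators to the summed denominators.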

    // V-measure = harmonic mean of homogeneity and completeness;
    // homogeneity: each cluster contains only members of a single class,
    // completeness: all members of a given class are assigned to the same cluster
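
    // What follows is a minimal, illustrative sketch of the V-measure; it is NOT part of
    // the original job and is not called by the pipeline above. It assumes the same input
    // shape as randIndex (each cluster is the list of ground truth labels of its members)
    // and collects the clustering to the driver, so it only suits small evaluation runs.
    public double vMeasure(JavaRDD<List<String>> clusters) {

        // keep only labelled elements, mirroring the empty-label filter in randIndex
        List<List<String>> local = clusters
                .map(c -> c.stream().filter(s -> !s.isEmpty()).collect(Collectors.toList()))
                .filter(c -> !c.isEmpty())
                .collect();

        long n = local.stream().mapToLong(List::size).sum();
        if (n == 0)
            return 0.0;

        // class (ground truth label) frequencies over all elements
        java.util.Map<String, Long> classCounts = local.stream()
                .flatMap(List::stream)
                .collect(Collectors.groupingBy(s -> s, Collectors.counting()));

        // H(C): entropy of the class distribution, H(K): entropy of the cluster distribution
        double hClass = entropy(classCounts.values(), n);
        double hCluster = entropy(local.stream().map(c -> (long) c.size()).collect(Collectors.toList()), n);

        // H(C|K): how mixed the classes are inside each cluster (0 = perfectly homogeneous)
        double hClassGivenCluster = 0.0;
        // H(K|C): how spread each class is across clusters (0 = perfectly complete)
        double hClusterGivenClass = 0.0;
        for (List<String> c : local) {
            java.util.Map<String, Long> counts = c.stream()
                    .collect(Collectors.groupingBy(s -> s, Collectors.counting()));
            for (java.util.Map.Entry<String, Long> e : counts.entrySet()) {
                double p = (double) e.getValue() / n;
                hClassGivenCluster -= p * Math.log((double) e.getValue() / c.size());
                hClusterGivenClass -= p * Math.log((double) e.getValue() / classCounts.get(e.getKey()));
            }
        }

        double homogeneity = hClass == 0.0 ? 1.0 : 1.0 - hClassGivenCluster / hClass;
        double completeness = hCluster == 0.0 ? 1.0 : 1.0 - hClusterGivenClass / hCluster;
        double sum = homogeneity + completeness;
        return sum == 0.0 ? 0.0 : 2 * homogeneity * completeness / sum;
    }

    // Shannon entropy of a discrete distribution given by counts over a total of n elements
    private static double entropy(java.util.Collection<Long> counts, long n) {
        double h = 0.0;
        for (long c : counts)
            if (c > 0)
                h -= ((double) c / n) * Math.log((double) c / n);
        return h;
    }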
}