package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;

import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 06/02/24
 */
public class EmitFromEntities implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(EmitFromEntities.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				EmitFromEntities.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/skgif/emit_biblio_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String workingDir = parser.get("workingDir");
		log.info("workingDir: {}", workingDir);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				emitFromResult(spark, inputPath, outputPath, workingDir);
			});
	}
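
	/**
	 * Emits, from the result entities, the persons, topics and datasource/publisher pairs needed by the skgif dump.
	 */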
	public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
		String workingDir) {
		emitPerson(spark, inputPath, outputPath, workingDir);
		emitTopic(spark, inputPath, outputPath, workingDir);
		emitDatasourcePublisher(spark, inputPath, workingDir);
	}
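
	/**
	 * Collects the (hostedby, publisher) pairs of the publications hosted by datasources typed as 'Journal archive'
	 * and stores them, deduplicated, under workingDir/datasourcePublisher.
	 */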
	private static void emitDatasourcePublisher(SparkSession spark, String inputPath, String workingDir) {
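		// identifiers of the non-deleted datasources typed as 'Journal archive'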
		Dataset<Row> journalIds = spark
			.read()
			.schema(Encoders.bean(Datasource.class).schema())
			.json(inputPath + "datasource")
			.filter(
				"datainfo.deletedbyinference != true and " +
					"eoscdatasourcetype.classid == 'Journal archive'")
			.select("id");
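		// non-deleted publications (the only result type carrying a journal)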
		Dataset<Publication> result = spark
			.read()
			.schema(Encoders.bean(Publication.class).schema())
			.json(inputPath + "publication")
			.filter("datainfo.deletedbyinference != true")
			.as(Encoders.bean(Publication.class));
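		// one (hostedby, publisher) pair per instance of each publication having both a journal and a publisher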
		Dataset<Row> datasourcePublisher = result.flatMap((FlatMapFunction<Publication, Tuple2<String, String>>) r -> {
			ArrayList<Tuple2<String, String>> dsPub = new ArrayList<>();
			if (Optional.ofNullable(r.getJournal()).isPresent() &&
				Optional.ofNullable(r.getPublisher()).isPresent()) {
				for (Instance i : r.getInstance())
					dsPub.add(new Tuple2<>(i.getHostedby().getKey(), r.getPublisher().getValue()));
			}
			return dsPub.iterator();
		}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
			.selectExpr("_1 as hostedby", "_2 as publisher");
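		// keep only the pairs hosted by a journal archive (left semi join) and write each pair once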
		datasourcePublisher
			.join(journalIds, datasourcePublisher.col("hostedby").equalTo(journalIds.col("id")), "leftsemi")
			.distinct()
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "/datasourcePublisher");
	}
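
	/**
	 * Dumps each fos/sdg/keyword subject of the results as a Topic. Topics are first materialized per result type
	 * under the working directory, then deduplicated by local identifier and written to outputPath/topics.
	 */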
	private static <R extends Result> void emitTopic(SparkSession spark, String inputPath, String outputPath,
		String workingDir) {
		ModelSupport.entityTypes.keySet().forEach(e -> {
			if (ModelSupport.isResult(e)) {
				Class<R> resultClazz = ModelSupport.entityTypes.get(e);
				Utils
					.readPath(spark, inputPath + e.name(), resultClazz)
					.filter((FilterFunction<R>) r -> Optional.ofNullable(r.getSubject()).isPresent())
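					// one Topic per fos/sdg/keyword subject, identified by classid + value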
					.flatMap(
						(FlatMapFunction<R, Topic>) r -> r
							.getSubject()
							.stream()
							.filter(
								s -> s.getQualifier().getClassid().equalsIgnoreCase("fos")
									|| s.getQualifier().getClassid().equalsIgnoreCase("sdg")
									|| s.getQualifier().getClassid().equalsIgnoreCase("keyword"))
							.map(s -> {
								Topic t = new Topic();
								t
									.setLocal_identifier(
										Utils
											.getIdentifier(
												Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
								t
									.setIdentifiers(
										Arrays
											.asList(
												Identifier.newInstance(s.getQualifier().getClassid(), s.getValue())));
								t.setName(s.getValue());
								return t;
							})
							.collect(Collectors.toList())
							.iterator(),
						Encoders.bean(Topic.class))
					.write()
					.mode(SaveMode.Overwrite)
					.option("compression", "gzip")
					.json(workingDir + e.name() + "/topic");
			}
		});
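		// union the per-type topics and keep one instance per local identifier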
		Dataset<Topic> topics = spark.emptyDataset(Encoders.bean(Topic.class));

		for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
			if (ModelSupport.isResult(entityType))
				topics = topics.union(Utils.readPath(spark, workingDir + entityType.name() + "/topic", Topic.class));
		}
		topics
			.groupByKey((MapFunction<Topic, String>) p -> p.getLocal_identifier(), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Topic, Topic>) (k, v) -> v.next(), Encoders.bean(Topic.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/topics");
	}
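
	/**
	 * Dumps each result author as a Persons entry. Authors with an ORCID get a stable person identifier; the others
	 * get a temporary identifier built from the result id and the author's rank (or position in the author list).
	 * Persons are deduplicated by local identifier and written to outputPath/persons.
	 */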
	private static <R extends Result> void emitPerson(SparkSession spark, String inputPath, String outputPath,
		String workingDir) {
		ModelSupport.entityTypes.keySet().forEach(e -> {
			if (ModelSupport.isResult(e)) {
				Class<R> resultClazz = ModelSupport.entityTypes.get(e);
				Utils
					.readPath(spark, inputPath + e.name(), resultClazz)
					.flatMap((FlatMapFunction<R, Persons>) r -> {
						List<Persons> authors = new ArrayList<>();

						if (Optional.ofNullable(r.getAuthor()).isPresent() && !r.getAuthor().isEmpty()) {
							int count = 0;
							for (Author a : r.getAuthor()) {
								count += 1;
								Persons p = new Persons();
								p.setFamily_name(a.getSurname());
								p.setGiven_name(a.getName());
								p.setFullname(a.getFullname());
								String identifier = "";
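								// prefer the (raw or inferred) ORCID as the stable person identifier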
								if (Optional.ofNullable(a.getPid()).isPresent()) {
									Tuple2<String, Boolean> orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils
										.getOrcid(a.getPid());
									if (orcid != null) {
										identifier = Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2());
										if (orcid._2())
											p
												.setIdentifiers(
													Arrays.asList(Identifier.newInstance("orcid", orcid._1())));
										else
											p
												.setIdentifiers(
													Arrays
														.asList(Identifier.newInstance("inferred_orcid", orcid._1())));
									} else {
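										// no ORCID: mint a temporary identifier from the result id and author rank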
										if (Optional.ofNullable(a.getRank()).isPresent()) {
											identifier = Utils
												.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + a.getRank());
										} else {
											identifier = Utils
												.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + count);
										}
									}
								}
								p.setLocal_identifier(identifier);
								authors.add(p);
							}
						}
						return authors.iterator();
					}, Encoders.bean(Persons.class))
					.filter((FilterFunction<Persons>) p -> p != null)
					.write()
					.mode(SaveMode.Overwrite)
					.option("compression", "gzip")
					.json(workingDir + e.name() + "/person");
			}
		});
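		// union the per-type persons and keep one instance per local identifier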
		Dataset<Persons> persons = spark.emptyDataset(Encoders.bean(Persons.class));

		for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
			if (ModelSupport.isResult(entityType))
				persons = persons
					.union(Utils.readPath(spark, workingDir + entityType.name() + "/person", Persons.class));
		}
		persons
			.groupByKey((MapFunction<Persons, String>) p -> p.getLocal_identifier(), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Persons, Persons>) (k, v) -> v.next(), Encoders.bean(Persons.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/persons");
	}
}