code formatting

Claudio Atzori 2023-11-23 16:33:24 +01:00
parent a0311e8a90
commit 1763d377ad
6 changed files with 274 additions and 275 deletions

@@ -18,7 +18,6 @@ package eu.dnetlib.pace.util;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Diff Match and Patch
 * Copyright 2018 The diff-match-patch Authors.

@@ -79,8 +79,8 @@ public class PrepareAffiliationRelationsTest {
			.getPath();
		String pubmedAffiliationRelationsPath = getClass()
			.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
			.getPath();
		String outputPath = workingDir.toString() + "/actionSet";

@@ -31,94 +31,94 @@ import scala.Tuple2;
public class PrepareResultCommunitySet {

	private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultCommunitySet.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final boolean production = Boolean.valueOf(parser.get("production"));
		log.info("production: {}", production);

		final CommunityEntityMap projectsMap = Utils.getCommunityProjects(production);
		// log.info("projectsMap: {}", new Gson().toJson(projectsMap));

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				removeOutputDir(spark, outputPath);
				prepareInfo(spark, inputPath, outputPath, projectsMap);
			});
	}

	private static void prepareInfo(
		SparkSession spark,
		String inputPath,
		String outputPath,
		CommunityEntityMap projectMap) {
		final StructType structureSchema = new StructType()
			.add(
				"dataInfo", new StructType()
					.add("deletedbyinference", DataTypes.BooleanType)
					.add("invisible", DataTypes.BooleanType))
			.add("source", DataTypes.StringType)
			.add("target", DataTypes.StringType)
			.add("relClass", DataTypes.StringType);

		spark
			.read()
			.schema(structureSchema)
			.json(inputPath)
			.filter(
				"dataInfo.deletedbyinference != true " +
					"and relClass == '" + ModelConstants.IS_PRODUCED_BY + "'")
			.select(
				new Column("source").as("resultId"),
				new Column("target").as("projectId"))
			.groupByKey((MapFunction<Row, String>) r -> (String) r.getAs("resultId"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, ResultProjectList>) (k, v) -> {
				ResultProjectList rpl = new ResultProjectList();
				rpl.setResultId(k);
				ArrayList<String> cl = new ArrayList<>();
				cl.addAll(projectMap.get(v.next().getAs("projectId")));
				v.forEachRemaining(r -> {
					projectMap
						.get(r.getAs("projectId"))
						.forEach(c -> {
							if (!cl.contains(c))
								cl.add(c);
						});
				});
				if (cl.size() == 0)
					return null;
				rpl.setCommunityList(cl);
				return rpl;
			}, Encoders.bean(ResultProjectList.class))
			.filter(Objects::nonNull)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}
}
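For context, prepareInfo boils the isProducedBy relations down to one record per result, holding the distinct communities of the projects that produced it. Below is a minimal, self-contained sketch of the same groupByKey/mapGroups pattern; the GroupByKeyExample class, the KeyValues bean, and the inline sample rows are hypothetical stand-ins for illustration, not part of this commit.

// A minimal sketch of the groupByKey/mapGroups aggregation used in prepareInfo().
// All names and sample data here are hypothetical, not taken from the commit.
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class GroupByKeyExample {

	// Bean analogous to ResultProjectList: a key plus the distinct values seen for it.
	public static class KeyValues implements Serializable {
		private String key;
		private ArrayList<String> values;

		public String getKey() { return key; }
		public void setKey(String key) { this.key = key; }
		public ArrayList<String> getValues() { return values; }
		public void setValues(ArrayList<String> values) { this.values = values; }
	}

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();

		// Hypothetical relation rows: (resultId, projectId)
		Dataset<Row> rels = spark
			.createDataFrame(
				Arrays.asList(
					RowFactory.create("r1", "p1"),
					RowFactory.create("r1", "p2"),
					RowFactory.create("r2", "p1")),
				new StructType()
					.add("resultId", DataTypes.StringType)
					.add("projectId", DataTypes.StringType));

		// Group by resultId, then fold each group into a single bean with de-duplicated values.
		Dataset<KeyValues> grouped = rels
			.groupByKey((MapFunction<Row, String>) r -> (String) r.getAs("resultId"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, KeyValues>) (k, it) -> {
				KeyValues kv = new KeyValues();
				kv.setKey(k);
				ArrayList<String> vs = new ArrayList<>();
				it.forEachRemaining(r -> {
					String v = r.getAs("projectId");
					if (!vs.contains(v))
						vs.add(v);
				});
				kv.setValues(vs);
				return kv;
			}, Encoders.bean(KeyValues.class));

		grouped.show(false); // r1 -> [p1, p2], r2 -> [p1]
		spark.stop();
	}
}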

@@ -5,22 +5,22 @@ import java.io.Serializable;
import java.util.ArrayList;

public class ResultProjectList implements Serializable {

	private String resultId;
	private ArrayList<String> communityList;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String resultId) {
		this.resultId = resultId;
	}

	public ArrayList<String> getCommunityList() {
		return communityList;
	}

	public void setCommunityList(ArrayList<String> communityList) {
		this.communityList = communityList;
	}
}

@@ -37,127 +37,127 @@ import scala.Tuple2;
 * @Date 11/10/23
 */
public class SparkResultToCommunityFromProject implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromProject.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkResultToCommunityFromProject.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttocommunityfromproject/input_communitytoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String possibleupdatespath = parser.get("preparedInfoPath");
		log.info("preparedInfoPath: {}", possibleupdatespath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				execPropagation(spark, inputPath, outputPath, possibleupdatespath);
			});
	}

	private static <R extends Result> void execPropagation(
		SparkSession spark,
		String inputPath,
		String outputPath,
		String possibleUpdatesPath) {
		Dataset<ResultProjectList> possibleUpdates = readPath(spark, possibleUpdatesPath, ResultProjectList.class);

		ModelSupport.entityTypes
			.keySet()
			.parallelStream()
			.forEach(e -> {
				if (ModelSupport.isResult(e)) {
					removeOutputDir(spark, outputPath + e.name());
					Class<R> resultClazz = ModelSupport.entityTypes.get(e);
					Dataset<R> result = readPath(spark, inputPath + e.name(), resultClazz);
					result
						.joinWith(
							possibleUpdates,
							result.col("id").equalTo(possibleUpdates.col("resultId")),
							"left_outer")
						.map(resultCommunityFn(), Encoders.bean(resultClazz))
						.write()
						.mode(SaveMode.Overwrite)
						.option("compression", "gzip")
						.json(outputPath + e.name());
				}
			});
	}

	private static <R extends Result> MapFunction<Tuple2<R, ResultProjectList>, R> resultCommunityFn() {
		return value -> {
			R ret = value._1();
			Optional<ResultProjectList> rcl = Optional.ofNullable(value._2());
			if (rcl.isPresent()) {
				// ArrayList<String> communitySet = rcl.get().getCommunityList();
				List<String> contextList = ret
					.getContext()
					.stream()
					.map(Context::getId)
					.collect(Collectors.toList());

				@SuppressWarnings("unchecked")
				R res = (R) ret.getClass().newInstance();
				res.setId(ret.getId());
				List<Context> propagatedContexts = new ArrayList<>();
				for (String cId : rcl.get().getCommunityList()) {
					if (!contextList.contains(cId)) {
						Context newContext = new Context();
						newContext.setId(cId);
						newContext
							.setDataInfo(
								Arrays
									.asList(
										getDataInfo(
											PROPAGATION_DATA_INFO_TYPE,
											PROPAGATION_RESULT_COMMUNITY_PROJECT_CLASS_ID,
											PROPAGATION_RESULT_COMMUNITY_PROJECT_CLASS_NAME,
											ModelConstants.DNET_PROVENANCE_ACTIONS)));
						propagatedContexts.add(newContext);
					} else {
						ret
							.getContext()
							.stream()
							.filter(c -> c.getId().equals(cId))
							.findFirst()
							.get()
							.getDataInfo()
							.add(
								getDataInfo(
									PROPAGATION_DATA_INFO_TYPE,
									PROPAGATION_RESULT_COMMUNITY_PROJECT_CLASS_ID,
									PROPAGATION_RESULT_COMMUNITY_PROJECT_CLASS_NAME,
									ModelConstants.DNET_PROVENANCE_ACTIONS));
					}
				}
				res.setContext(propagatedContexts);
				ret.mergeFrom(res);
			}
			return ret;
		};
	}
}
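The heart of resultCommunityFn() is the per-community decision between creating a new context and enriching an existing one with extra provenance. Below is a minimal, plain-Java sketch of that decision, assuming a SimpleContext stand-in for the oaf Context class; the class, the provenance string, and the sample data are hypothetical illustrations, not part of this commit.

// A minimal sketch of the propagation decision applied per result in resultCommunityFn().
// SimpleContext and the sample data are hypothetical, not taken from the commit.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class PropagationSketch {

	// Hypothetical stand-in for eu.dnetlib.dhp.schema.oaf.Context: an id plus provenance entries.
	static class SimpleContext {
		final String id;
		final List<String> dataInfo = new ArrayList<>();

		SimpleContext(String id) {
			this.id = id;
		}
	}

	public static void main(String[] args) {
		// Contexts already attached to the result.
		List<SimpleContext> existing = new ArrayList<>(Arrays.asList(new SimpleContext("aurora")));
		// Communities propagated from the result's projects.
		List<String> communityList = Arrays.asList("aurora", "netherlands");

		List<String> existingIds = existing.stream().map(c -> c.id).collect(Collectors.toList());

		List<SimpleContext> propagatedContexts = new ArrayList<>();
		for (String cId : communityList) {
			if (!existingIds.contains(cId)) {
				// Community not yet on the result: create a new context carrying the propagation provenance.
				SimpleContext newContext = new SimpleContext(cId);
				newContext.dataInfo.add("result:community:project");
				propagatedContexts.add(newContext);
			} else {
				// Community already present: only append the propagation provenance to the existing context.
				existing
					.stream()
					.filter(c -> c.id.equals(cId))
					.findFirst()
					.get()
					.dataInfo.add("result:community:project");
			}
		}

		// Mirrors res.setContext(propagatedContexts) + ret.mergeFrom(res): only new contexts travel in res.
		System.out.println("newly propagated contexts: " + propagatedContexts.size()); // prints 1
	}
}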

@@ -31,103 +31,103 @@ import eu.dnetlib.dhp.schema.oaf.Dataset;
public class ResultToCommunityJobTest {

	private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(ResultToCommunityJobTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(OrcidPropagationJobTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	void testSparkResultToCommunityFromProjectJob() throws Exception {
		final String preparedInfoPath = getClass()
			.getResource("/eu/dnetlib/dhp/resulttocommunityfromproject/preparedInfo")
			.getPath();
		SparkResultToCommunityFromProject
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", getClass()
						.getResource("/eu/dnetlib/dhp/resulttocommunityfromproject/sample/")
						.getPath(),
					"-outputPath", workingDir.toString() + "/",
					"-preparedInfoPath", preparedInfoPath
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Dataset> tmp = sc
			.textFile(workingDir.toString() + "/dataset")
			.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));

		Assertions.assertEquals(10, tmp.count());
		/**
		 * {"resultId":"50|57a035e5b1ae::d5be548ca7ae489d762f893be67af52f","communityList":["aurora"]}
		 * {"resultId":"50|57a035e5b1ae::a77232ffca9115fcad51c3503dbc7e3e","communityList":["aurora"]}
		 * {"resultId":"50|57a035e5b1ae::803aaad4decab7e27cd4b52a1931b3a1","communityList":["sdsn-gr"]}
		 * {"resultId":"50|57a035e5b1ae::a02e9e4087bca50687731ae5c765b5e1","communityList":["netherlands"]}
		 */

		List<Context> context = tmp
			.filter(r -> r.getId().equals("50|57a035e5b1ae::d5be548ca7ae489d762f893be67af52f"))
			.first()
			.getContext();
		Assertions.assertTrue(context.stream().anyMatch(c -> containsResultCommunityProject(c)));

		context = tmp
			.filter(r -> r.getId().equals("50|57a035e5b1ae::a77232ffca9115fcad51c3503dbc7e3e"))
			.first()
			.getContext();
		Assertions.assertTrue(context.stream().anyMatch(c -> containsResultCommunityProject(c)));

		Assertions
			.assertEquals(
				0, tmp.filter(r -> r.getId().equals("50|57a035e5b1ae::803aaad4decab7e27cd4b52a1931b3a1")).count());

		Assertions
			.assertEquals(
				0, tmp.filter(r -> r.getId().equals("50|57a035e5b1ae::a02e9e4087bca50687731ae5c765b5e1")).count());

		Assertions
			.assertEquals(
				2, tmp.filter(r -> r.getContext().stream().anyMatch(c -> c.getId().equals("aurora"))).count());
	}

	private static boolean containsResultCommunityProject(Context c) {
		return c
			.getDataInfo()
			.stream()
			.anyMatch(di -> di.getProvenanceaction().getClassid().equals("result:community:project"));
	}
}