package eu.dnetlib.dhp.oa.graph.dump;

import static eu.dnetlib.dhp.oa.graph.dump.Constants.*;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.jetbrains.annotations.NotNull;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.complete.Constants;
import eu.dnetlib.dhp.oa.model.Indicator;
import eu.dnetlib.dhp.oa.model.Score;
import eu.dnetlib.dhp.oa.model.UsageCounts;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.oa.model.graph.Relation;
import eu.dnetlib.dhp.oa.model.graph.ResearchCommunity;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;

public class Utils {

	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static final String ENTITY_ID_SEPARATOR = "|";

	private Utils() {
	}

	public static void removeOutputDir(SparkSession spark, String path) {
		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
	}

	/**
	 * Reads a text file of JSON-serialized records and deserializes each line into an instance of the given class.
	 */
	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		return spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
	}

	public static String getContextId(String id) {
		return String
			.format(
				"%s::%s", Constants.CONTEXT_NS_PREFIX,
				DHPUtils.md5(id));
	}

	public static CommunityMap getCommunityMap(SparkSession spark, String communityMapPath) {
		return new Gson()
			.fromJson(spark.read().textFile(communityMapPath).collectAsList().get(0), CommunityMap.class);
	}

	public static CommunityMap readCommunityMap(FileSystem fileSystem, String communityMapPath) throws IOException {
		StringBuilder sb = new StringBuilder();
		// try-with-resources guarantees the reader is closed even when an exception is thrown
		try (BufferedReader br = new BufferedReader(
			new InputStreamReader(fileSystem.open(new Path(communityMapPath))))) {
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
		}
		return new Gson().fromJson(sb.toString(), CommunityMap.class);
	}

	public static String getEntityId(String id, String separator) {
		return id.substring(id.indexOf(separator) + 1);
	}

	/**
	 * Collects the identifiers of every dumped entity (results, organizations, projects, datasources and
	 * communities/infrastructures) into a single Dataset of ids.
	 */
	public static Dataset<String> getEntitiesId(SparkSession spark, String inputPath) {
		Dataset<String> dumpedIds = Utils
			.readPath(spark, inputPath + "/publication", GraphResult.class)
			.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING())
			.union(
				Utils
					.readPath(spark, inputPath + "/dataset", GraphResult.class)
					.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
			.union(
				Utils
					.readPath(spark, inputPath + "/software", GraphResult.class)
					.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
			.union(
				Utils
					.readPath(spark, inputPath + "/otherresearchproduct", GraphResult.class)
					.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
			.union(
				Utils
					.readPath(spark, inputPath + "/organization", eu.dnetlib.dhp.oa.model.graph.Organization.class)
					.map(
						(MapFunction<eu.dnetlib.dhp.oa.model.graph.Organization, String>) o -> o.getId(),
						Encoders.STRING()))
			.union(
				Utils
					.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.oa.model.graph.Project.class)
					.map(
						(MapFunction<eu.dnetlib.dhp.oa.model.graph.Project, String>) o -> o.getId(),
						Encoders.STRING()))
			.union(
				Utils
					.readPath(spark, inputPath + "/datasource", eu.dnetlib.dhp.oa.model.graph.Datasource.class)
					.map(
						(MapFunction<eu.dnetlib.dhp.oa.model.graph.Datasource, String>) o -> o.getId(),
						Encoders.STRING()))
			.union(
				Utils
					.readPath(spark, inputPath + "/communities_infrastructures", ResearchCommunity.class)
					.map((MapFunction<ResearchCommunity, String>) c -> c.getId(), Encoders.STRING()));
		return dumpedIds;
	}

	/**
	 * Keeps only the relations whose source and target both appear among the dumped entity identifiers:
	 * the relations are joined first on the source id and then on the target id.
	 */
	public static Dataset<Relation> getValidRelations(Dataset<Relation> relations, Dataset<String> entitiesIds) {
		// key each relation by its source id
		Dataset<Tuple2<String, Relation>> relationSource = relations
			.map(
				(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getSource(), r),
				Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));

		// keep the relations whose source was dumped, re-keyed by their target id
		Dataset<Tuple2<String, Relation>> relJoinSource = relationSource
			.joinWith(entitiesIds, relationSource.col("_1").equalTo(entitiesIds.col("value")))
			.map(
				(MapFunction<Tuple2<Tuple2<String, Relation>, String>, Tuple2<String, Relation>>) t2 -> new Tuple2<>(
					t2._1()._2().getTarget(), t2._1()._2()),
				Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));

		// keep the relations whose target was dumped as well
		return relJoinSource
			.joinWith(entitiesIds, relJoinSource.col("_1").equalTo(entitiesIds.col("value")))
			.map(
				(MapFunction<Tuple2<Tuple2<String, Relation>, String>, Relation>) t2 -> t2._1()._2(),
				Encoders.bean(Relation.class));
	}

	/**
	 * Maps the OAF measures onto the dump Indicator model: usage counts (downloads and views) plus the
	 * BIP! impact scores.
	 */
	public static Indicator getIndicator(List<Measure> measures) {
		Indicator i = new Indicator();
		for (Measure m : measures) {
			switch (m.getId()) {
				case USAGE_COUNT_DOWNLOADS:
					getUsageCounts(i).setDownloads(m.getUnit().get(0).getValue());
					break;
				case USAGE_COUNT_VIEWS:
					getUsageCounts(i).setViews(m.getUnit().get(0).getValue());
					break;
				default:
					getImpactMeasure(i).add(getScore(m.getId(), m.getUnit()));
					break;
			}
		}
		return i;
	}

	@NotNull
	private static UsageCounts getUsageCounts(Indicator i) {
		if (i.getUsageCounts() == null) {
			i.setUsageCounts(new UsageCounts());
		}
		return i.getUsageCounts();
	}

	@NotNull
	private static List<Score> getImpactMeasure(Indicator i) {
		if (i.getBipIndicators() == null) {
			i.setBipIndicators(new ArrayList<>());
		}
		return i.getBipIndicators();
	}

	private static Score getScore(String indicator, List<KeyValue> unit) {
		Score s = new Score();
		s.setIndicator(indicator);
		for (KeyValue u : unit) {
			if (u.getKey().equals("score")) {
				s.setScore(u.getValue());
			} else {
				s.setClazz(u.getValue());
			}
		}
		return s;
	}
}
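
/*
 * Usage sketch (illustrative only, not part of this class): given an existing SparkSession and the
 * root path of a dump, the helpers above can be chained to keep only the relations whose source and
 * target entities were both dumped. The variable names and the "/relation" sub-path are assumptions.
 *
 *     Dataset<String> dumpedIds = Utils.getEntitiesId(spark, outputPath);
 *     Dataset<Relation> relations = Utils.readPath(spark, outputPath + "/relation", Relation.class);
 *     Dataset<Relation> validRelations = Utils.getValidRelations(relations, dumpedIds);
 */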