refactoring

Miriam Baglioni 2020-07-29 17:42:50 +02:00
parent 8ad8dac7d4
commit 074e9ab75e
16 changed files with 579 additions and 544 deletions
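
The changes are a mechanical code-style pass over the dump classes: imports are regrouped and reordered (static imports first, then java/javax, org, com, and eu.dnetlib groups), long statements and signatures are rewrapped one call per line, and spacing around keywords, lambdas, and casts is normalized. No behavioural change appears intended.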

View File: ResultMapper.java

@@ -396,8 +396,8 @@ public class ResultMapper implements Serializable {
 			if (contextList.size() > 0) {
 				Set<Integer> hashValue = new HashSet<>();
 				List<Context> remainigContext = new ArrayList<>();
-				contextList.forEach(c ->{
-					if(!hashValue.contains(c.hashCode())){
+				contextList.forEach(c -> {
+					if (!hashValue.contains(c.hashCode())) {
 						remainigContext.add(c);
 						hashValue.add(c.hashCode());
 					}
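
The block being reformatted is ResultMapper's context de-duplication: it keeps the first Context seen for each distinct hashCode. The same pattern as a standalone sketch (the generic type stands in for the dump Context class):

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DedupSketch {
	// Keep the first element seen for each distinct hashCode, preserving order.
	static <T> List<T> dedupByHashCode(List<T> contextList) {
		Set<Integer> hashValue = new HashSet<>();
		List<T> remaining = new ArrayList<>();
		contextList.forEach(c -> {
			if (!hashValue.contains(c.hashCode())) {
				remaining.add(c);
				hashValue.add(c.hashCode());
			}
		});
		return remaining;
	}
}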

View File: CommunitySplit.java

@@ -1,11 +1,7 @@
 package eu.dnetlib.dhp.oa.graph.dump.community;
 
-import eu.dnetlib.dhp.oa.graph.dump.Utils;
-import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
-import org.apache.spark.SparkConf;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
 import java.io.Serializable;
 import java.util.Objects;

@@ -13,11 +9,16 @@ import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
 
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
 
 public class CommunitySplit implements Serializable {
 
 	public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, CommunityMap communityMap) {
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(

@@ -48,7 +49,7 @@ public class CommunitySplit implements Serializable {
 		Dataset<CommunityResult> community_products = result
 			.filter(r -> containsCommunity(r, c));
 
-		try{
+		try {
 			community_products.first();
 			community_products
 				.repartition(1)

@@ -56,7 +57,7 @@ public class CommunitySplit implements Serializable {
 				.option("compression", "gzip")
 				.mode(SaveMode.Overwrite)
 				.json(outputPath + "/" + c);
-		}catch(Exception e){
+		} catch (Exception e) {
 
 		}
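
The try/catch reformatted here guards against empty splits: Dataset.first() throws java.util.NoSuchElementException on an empty Dataset, so the empty catch block silently skips communities with no results. A minimal sketch of the idiom (method name and path handling are illustrative, not from the commit):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;

public class SplitWriteSketch {
	// first() fails on an empty Dataset, so the catch skips communities
	// that have nothing to dump.
	static <T> void writeIfNonEmpty(Dataset<T> communityProducts, String outputPath) {
		try {
			communityProducts.first();
			communityProducts
				.repartition(1)
				.write()
				.option("compression", "gzip")
				.mode(SaveMode.Overwrite)
				.json(outputPath);
		} catch (Exception e) {
			// nothing to write for this community
		}
	}
}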

View File

@@ -25,8 +25,8 @@ import org.slf4j.LoggerFactory;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
-import eu.dnetlib.dhp.schema.dump.oaf.community.Funder;
 import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
+import eu.dnetlib.dhp.schema.dump.oaf.community.Funder;
 import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Relation;

View File: SparkSplitForCommunity.java

@@ -64,16 +64,10 @@ public class SparkSplitForCommunity implements Serializable {
 		split.run(isSparkSessionManaged, inputPath, outputPath, communityMap);
 
 	}
-
-
-
 	public static ISLookUpService getIsLookUpService(String isLookUpUrl) {
 		return ISLookupClientFactory.getLookUpService(isLookUpUrl);
 	}
-
-
-
 }

View File: SparkUpdateProjectInfo.java

@@ -19,7 +19,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
-
 
 public class SparkUpdateProjectInfo implements Serializable {

View File: CreateContextEntities.java

@@ -46,7 +46,6 @@ public class CreateContextEntities implements Serializable {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
-
 
 		final String hdfsPath = parser.get("hdfsPath");
 		log.info("hdfsPath: {}", hdfsPath);

@@ -90,8 +89,6 @@ public class CreateContextEntities implements Serializable {
 		queryInformationSystem.getContextInformation(consumer);
 	}
 
-
-
 	protected <R extends ResearchInitiative> void writeEntity(final R r) {
 		try {
 			writer.write(Utils.OBJECT_MAPPER.writeValueAsString(r));

View File: CreateContextRelation.java

@@ -11,7 +11,6 @@ import java.util.Optional;
 import java.util.function.Consumer;
 import java.util.function.Function;
 
-import eu.dnetlib.dhp.schema.oaf.Datasource;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;

@@ -24,6 +23,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 
 public class CreateContextRelation implements Serializable {

@@ -66,7 +66,10 @@
 		cce.execute(Process::getRelation, CONTEX_RELATION_DATASOURCE, ModelSupport.getIdPrefix(Datasource.class));
 
 		log.info("Creating relations for projects... ");
-		cce.execute(Process::getRelation, CONTEX_RELATION_PROJECT, ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
+		cce
+			.execute(
+				Process::getRelation, CONTEX_RELATION_PROJECT,
+				ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
 
 	}

@@ -92,14 +95,14 @@
 	}
 
-	public void execute(final Function<ContextInfo, List<Relation>> producer, String category, String prefix) throws Exception {
+	public void execute(final Function<ContextInfo, List<Relation>> producer, String category, String prefix)
+		throws Exception {
 
 		final Consumer<ContextInfo> consumer = ci -> producer.apply(ci).forEach(c -> writeEntity(c));
 
 		queryInformationSystem.getContextRelation(consumer, category, prefix);
 	}
 
 	protected void writeEntity(final Relation r) {
 		try {
 			writer.write(Utils.OBJECT_MAPPER.writeValueAsString(r));
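
The execute method whose signature is rewrapped above follows a producer/consumer shape: a function producing relations from a ContextInfo is adapted into a consumer that writes each relation out. Reduced to generics, a sketch (type parameters stand in for ContextInfo and Relation):

import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;

public class ExecuteSketch {
	// Turn a producer of relations into a consumer that writes each one.
	static <I, R> Consumer<I> asWriter(Function<I, List<R>> producer, Consumer<R> writeEntity) {
		return i -> producer.apply(i).forEach(writeEntity);
	}
}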

View File: DumpGraphEntities.java

@@ -11,8 +11,8 @@ import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;
 
-import eu.dnetlib.dhp.schema.dump.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.Journal;
+import javax.swing.text.html.Option;
+
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;

@@ -26,12 +26,12 @@ import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.dump.oaf.*;
 import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
 import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.Journal;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import javax.swing.text.html.Option;
 
 public class DumpGraphEntities implements Serializable {
 
 	public void run(Boolean isSparkSessionManaged,

@@ -80,7 +80,8 @@
 	}
 
-	private static <E extends OafEntity> void datasourceMap(SparkSession spark, String inputPath, String outputPath, Class<E> inputClazz) {
+	private static <E extends OafEntity> void datasourceMap(SparkSession spark, String inputPath, String outputPath,
+		Class<E> inputClazz) {
 		Utils
 			.readPath(spark, inputPath, inputClazz)
 			.map(d -> mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) d), Encoders.bean(Datasource.class))

@@ -90,8 +91,6 @@
 			.json(outputPath);
 	}
 
-
-
 	private static <E extends OafEntity> void projectMap(SparkSession spark, String inputPath, String outputPath,
 		Class<E> inputClazz) {
 		Utils

@@ -110,83 +109,119 @@
 		Optional.ofNullable(d.getOriginalId()).ifPresent(oId -> datasource.setOriginalId(oId));
 
-		Optional.ofNullable(d.getPid())
-			.ifPresent(pids -> pids.stream().map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
+		Optional
+			.ofNullable(d.getPid())
+			.ifPresent(
+				pids -> pids
+					.stream()
+					.map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
 					.collect(Collectors.toList()));
 
-		Optional.ofNullable(d.getDatasourcetype())
-			.ifPresent(dsType -> datasource.setDatasourcetype(ControlledField.newInstance(dsType.getClassid(), dsType.getClassname())));
+		Optional
+			.ofNullable(d.getDatasourcetype())
+			.ifPresent(
+				dsType -> datasource
+					.setDatasourcetype(ControlledField.newInstance(dsType.getClassid(), dsType.getClassname())));
 
-		Optional.ofNullable(d.getOpenairecompatibility())
+		Optional
+			.ofNullable(d.getOpenairecompatibility())
 			.ifPresent(v -> datasource.setOpenairecompatibility(v.getClassname()));
 
-		Optional.ofNullable(d.getOfficialname())
+		Optional
+			.ofNullable(d.getOfficialname())
 			.ifPresent(oname -> datasource.setOfficialname(oname.getValue()));
 
-		Optional.ofNullable(d.getEnglishname())
+		Optional
+			.ofNullable(d.getEnglishname())
 			.ifPresent(ename -> datasource.setEnglishname(ename.getValue()));
 
-		Optional.ofNullable(d.getWebsiteurl())
+		Optional
+			.ofNullable(d.getWebsiteurl())
 			.ifPresent(wsite -> datasource.setWebsiteurl(wsite.getValue()));
 
-		Optional.ofNullable(d.getLogourl())
+		Optional
+			.ofNullable(d.getLogourl())
 			.ifPresent(lurl -> datasource.setLogourl(lurl.getValue()));
 
-		Optional.ofNullable(d.getDateofvalidation())
+		Optional
+			.ofNullable(d.getDateofvalidation())
 			.ifPresent(dval -> datasource.setDateofvalidation(dval.getValue()));
 
-		Optional.ofNullable(d.getDescription())
+		Optional
+			.ofNullable(d.getDescription())
 			.ifPresent(dex -> datasource.setDescription(dex.getValue()));
 
-		Optional.ofNullable(d.getSubjects())
-			.ifPresent(sbjs -> datasource.setSubjects(sbjs.stream().map(sbj -> sbj.getValue()).collect(Collectors.toList())));
+		Optional
+			.ofNullable(d.getSubjects())
+			.ifPresent(
+				sbjs -> datasource.setSubjects(sbjs.stream().map(sbj -> sbj.getValue()).collect(Collectors.toList())));
 
-		Optional.ofNullable(d.getOdpolicies())
-			.ifPresent(odp->datasource.setPolicies(Arrays.asList(odp.getValue())));
+		Optional
+			.ofNullable(d.getOdpolicies())
+			.ifPresent(odp -> datasource.setPolicies(Arrays.asList(odp.getValue())));
 
-		Optional.ofNullable(d.getOdlanguages())
-			.ifPresent(langs -> datasource.setLanguages(langs.stream().map(lang -> lang.getValue()).collect(Collectors.toList())));
+		Optional
+			.ofNullable(d.getOdlanguages())
+			.ifPresent(
+				langs -> datasource
+					.setLanguages(langs.stream().map(lang -> lang.getValue()).collect(Collectors.toList())));
 
-		Optional.ofNullable(d.getOdcontenttypes())
-			.ifPresent(ctypes -> datasource.setContenttypes(ctypes.stream().map(ctype -> ctype.getValue()).collect(Collectors.toList())));
+		Optional
+			.ofNullable(d.getOdcontenttypes())
+			.ifPresent(
+				ctypes -> datasource
+					.setContenttypes(ctypes.stream().map(ctype -> ctype.getValue()).collect(Collectors.toList())));
 
-		Optional.ofNullable(d.getReleasestartdate())
+		Optional
+			.ofNullable(d.getReleasestartdate())
 			.ifPresent(rd -> datasource.setReleasestartdate(rd.getValue()));
 
-		Optional.ofNullable(d.getReleaseenddate())
+		Optional
+			.ofNullable(d.getReleaseenddate())
 			.ifPresent(ed -> datasource.setReleaseenddate(ed.getValue()));
 
-		Optional.ofNullable(d.getMissionstatementurl())
+		Optional
+			.ofNullable(d.getMissionstatementurl())
 			.ifPresent(ms -> datasource.setMissionstatementurl(ms.getValue()));
 
-		Optional.ofNullable(d.getDatabaseaccesstype())
+		Optional
+			.ofNullable(d.getDatabaseaccesstype())
 			.ifPresent(ar -> datasource.setAccessrights(ar.getValue()));
 
-		Optional.ofNullable(d.getDatauploadtype())
+		Optional
+			.ofNullable(d.getDatauploadtype())
 			.ifPresent(dut -> datasource.setUploadrights(dut.getValue()));
 
-		Optional.ofNullable(d.getDatabaseaccessrestriction())
-			.ifPresent(dar ->datasource.setDatabaseaccessrestriction(dar.getValue()));
+		Optional
+			.ofNullable(d.getDatabaseaccessrestriction())
+			.ifPresent(dar -> datasource.setDatabaseaccessrestriction(dar.getValue()));
 
-		Optional.ofNullable(d.getDatauploadrestriction())
+		Optional
+			.ofNullable(d.getDatauploadrestriction())
 			.ifPresent(dur -> datasource.setDatauploadrestriction(dur.getValue()));
 
-		Optional.ofNullable(d.getVersioning())
-			.ifPresent(v->datasource.setVersioning(v.getValue()));
+		Optional
+			.ofNullable(d.getVersioning())
+			.ifPresent(v -> datasource.setVersioning(v.getValue()));
 
-		Optional.ofNullable(d.getCitationguidelineurl())
+		Optional
+			.ofNullable(d.getCitationguidelineurl())
 			.ifPresent(cu -> datasource.setCitationguidelineurl(cu.getValue()));
 
-		Optional.ofNullable(d.getPidsystems())
+		Optional
+			.ofNullable(d.getPidsystems())
 			.ifPresent(ps -> datasource.setPidsystems(ps.getValue()));
 
-		Optional.ofNullable(d.getCertificates())
+		Optional
+			.ofNullable(d.getCertificates())
 			.ifPresent(c -> datasource.setCertificates(c.getValue()));
 
-		Optional.ofNullable(d.getPolicies())
+		Optional
+			.ofNullable(d.getPolicies())
 			.ifPresent(ps -> datasource.setPolicies(ps.stream().map(p -> p.getValue()).collect(Collectors.toList())));
 
-		Optional.ofNullable(d.getJournal())
+		Optional
+			.ofNullable(d.getJournal())
 			.ifPresent(j -> datasource.setJournal(getContainer(j)));
 
 		return datasource;

@@ -196,37 +231,48 @@
 	private static Container getContainer(Journal j) {
 		Container c = new Container();
 
-		Optional.ofNullable(j.getName())
-			.ifPresent(n->c.setName(n));
+		Optional
+			.ofNullable(j.getName())
+			.ifPresent(n -> c.setName(n));
 
-		Optional.ofNullable(j.getIssnPrinted())
+		Optional
+			.ofNullable(j.getIssnPrinted())
 			.ifPresent(issnp -> c.setIssnPrinted(issnp));
 
-		Optional.ofNullable(j.getIssnOnline())
+		Optional
+			.ofNullable(j.getIssnOnline())
 			.ifPresent(issno -> c.setIssnOnline(issno));
 
-		Optional.ofNullable(j.getIssnLinking())
+		Optional
+			.ofNullable(j.getIssnLinking())
 			.ifPresent(isnl -> c.setIssnLinking(isnl));
 
-		Optional.ofNullable(j.getEp())
+		Optional
+			.ofNullable(j.getEp())
 			.ifPresent(ep -> c.setEp(ep));
 
-		Optional.ofNullable(j.getIss())
+		Optional
+			.ofNullable(j.getIss())
 			.ifPresent(iss -> c.setIss(iss));
 
-		Optional.ofNullable(j.getSp())
+		Optional
+			.ofNullable(j.getSp())
 			.ifPresent(sp -> c.setSp(sp));
 
-		Optional.ofNullable(j.getVol())
+		Optional
+			.ofNullable(j.getVol())
 			.ifPresent(vol -> c.setVol(vol));
 
-		Optional.ofNullable(j.getEdition())
+		Optional
+			.ofNullable(j.getEdition())
 			.ifPresent(edition -> c.setEdition(edition));
 
-		Optional.ofNullable(j.getConferencedate())
+		Optional
+			.ofNullable(j.getConferencedate())
 			.ifPresent(cdate -> c.setConferencedate(cdate));
 
-		Optional.ofNullable(j.getConferenceplace())
+		Optional
+			.ofNullable(j.getConferenceplace())
 			.ifPresent(cplace -> c.setConferenceplace(cplace));
 
 		return c;

@@ -466,8 +512,6 @@
 					.map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
 					.collect(Collectors.toList())));
-
 		return organization;
-
 	}
 }
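
Nearly every statement the formatter rewraps in mapDatasource and getContainer is the same null-safe copy idiom, shown generically below. One detail survives the reformat unchanged: the d.getPid() branch still collects its list of ControlledField values without ever assigning it to the datasource, so that computation is discarded both before and after this commit.

import java.util.Optional;
import java.util.function.Consumer;

public class SetIfPresentSketch {
	// Apply the setter only when the source value is non-null.
	static <T> void setIfPresent(T value, Consumer<T> setter) {
		Optional.ofNullable(value).ifPresent(setter);
	}
}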

View File: Extractor.java

@@ -1,5 +1,17 @@
 package eu.dnetlib.dhp.oa.graph.dump.graph;
 
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.*;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+
 import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;

@@ -13,16 +25,6 @@ import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.Project;
 import eu.dnetlib.dhp.schema.oaf.Result;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.FlatMapFunction;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
-
-import java.io.Serializable;
-import java.util.*;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
 public class Extractor implements Serializable {

@@ -43,8 +45,6 @@
 			});
 	}
 
-
-
 //	private static void extractRelationProjects(SparkSession spark, String inputPath, String outputPath){
 //		Utils.readPath(spark, inputPath, Project.class)
 //			.flatMap((FlatMapFunction<Project, Relation>) project ->{

@@ -70,7 +70,6 @@
 //			.json(outputPath);
 //	}
-
 	private <R extends Result> void extractRelationResult(SparkSession spark,
 		String inputPath,
 		String outputPath,

@@ -88,14 +87,16 @@
 			.ifPresent(inst -> inst.forEach(instance -> {
 				Optional
 					.ofNullable(instance.getCollectedfrom())
-					.ifPresent(cf ->
-						getRelatioPair(value, relationList, cf,
-							ModelConstants.IS_PROVIDED_BY, ModelConstants.PROVIDES, hashCodes)
-					);
+					.ifPresent(
+						cf -> getRelatioPair(
+							value, relationList, cf,
+							ModelConstants.IS_PROVIDED_BY, ModelConstants.PROVIDES, hashCodes));
 				Optional
 					.ofNullable(instance.getHostedby())
-					.ifPresent(hb -> getRelatioPair(value, relationList, hb,
-						Constants.IS_HOSTED_BY, Constants.HOSTS , hashCodes) );
+					.ifPresent(
+						hb -> getRelatioPair(
+							value, relationList, hb,
+							Constants.IS_HOSTED_BY, Constants.HOSTS, hashCodes));
 			}));
 		Set<String> communities = communityMap.keySet();
 		Optional

@@ -124,7 +125,7 @@
 						Constants.RESULT_ENTITY,
 						Constants.CONTEXT_ENTITY,
 						ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, provenance);
-					if(!hashCodes.contains(r.hashCode())){
+					if (!hashCodes.contains(r.hashCode())) {
 						relationList
 							.add(r);
 						hashCodes.add(r.hashCode());

@@ -135,7 +136,7 @@
 							Constants.RESULT_ENTITY,
 							ModelConstants.RELATIONSHIP,
 							ModelConstants.IS_RELATED_TO, provenance);
-						if(!hashCodes.contains(r.hashCode())){
+						if (!hashCodes.contains(r.hashCode())) {
 							relationList
 								.add(
 									r);

@@ -175,7 +176,7 @@
 				cf.getKey(), Constants.RESULT_ENTITY, Constants.DATASOURCE_ENTITY,
 				result_dtasource, ModelConstants.PROVISION,
 				provenance);
-			if(!hashCodes.contains(r.hashCode())){
+			if (!hashCodes.contains(r.hashCode())) {
 				relationList
 					.add(r);
 				hashCodes.add(r.hashCode());

@@ -187,7 +188,7 @@
 				datasource_result, ModelConstants.PROVISION,
 				provenance);
-			if(!hashCodes.contains(r.hashCode())){
+			if (!hashCodes.contains(r.hashCode())) {
 				relationList
 					.add(r);
 				hashCodes.add(r.hashCode());
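
The four identical if blocks reformatted above guard against emitting duplicate relations; factored out, the guard is just the following (a sketch, with R standing in for the dump Relation class):

import java.util.List;
import java.util.Set;

public class AddIfNewSketch {
	// Add a relation only if an equal one (by hashCode) was not already emitted.
	static <R> void addIfNew(R r, List<R> relationList, Set<Integer> hashCodes) {
		if (!hashCodes.contains(r.hashCode())) {
			relationList.add(r);
			hashCodes.add(r.hashCode());
		}
	}
}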

View File: Process.java

@@ -1,5 +1,10 @@
 package eu.dnetlib.dhp.oa.graph.dump.graph;
 
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
 import eu.dnetlib.dhp.oa.graph.dump.Constants;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.schema.common.ModelConstants;

@@ -7,10 +12,6 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
 import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
 
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-
 public class Process implements Serializable {
 
 	public static <R extends ResearchInitiative> R getEntity(ContextInfo ci) {

View File: QueryInformationSystem.java

@@ -9,7 +9,6 @@ import java.util.*;
 import java.util.function.Consumer;
 import java.util.stream.Collectors;
 
-import eu.dnetlib.dhp.schema.common.ModelSupport;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
 import org.dom4j.Element;

@@ -17,6 +16,7 @@ import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 import org.jetbrains.annotations.NotNull;
 
+import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@@ -52,7 +52,7 @@ public class QueryInformationSystem {
 				cinfo.setId(cSplit[0]);
 				cinfo.setName(cSplit[1]);
 				cinfo.setDescription(cSplit[2]);
-				if (!cSplit[3].trim().equals("")){
+				if (!cSplit[3].trim().equals("")) {
 					cinfo.setSubject(Arrays.asList(cSplit[3].split(",")));
 				}
 				cinfo.setZenodocommunity(cSplit[4]);

@@ -98,7 +98,7 @@ public class QueryInformationSystem {
 		Iterator it = root.elementIterator();
 		while (it.hasNext()) {
 			Element el = (Element) it.next();
-			if(el.getName().equals("category")){
+			if (el.getName().equals("category")) {
 				String categoryId = el.attributeValue("id");
 				categoryId = categoryId.substring(categoryId.lastIndexOf("::") + 2);
 				if (categoryId.equals(category)) {

@@ -119,9 +119,9 @@ public class QueryInformationSystem {
 	@NotNull
 	private List<String> getCategoryList(Element el, String prefix) {
 		List<String> datasourceList = new ArrayList<>();
-		for(Object node : el.selectNodes(".//param")){
-			Node n = (Node)node;
-			if(n.valueOf("./@name").equals("openaireId")){
+		for (Object node : el.selectNodes(".//param")) {
+			Node n = (Node) node;
+			if (n.valueOf("./@name").equals("openaireId")) {
 				datasourceList.add(prefix + "|" + n.getText());
 			}
 		}

View File: SparkCollectAndSave.java

@@ -1,13 +1,12 @@
 package eu.dnetlib.dhp.oa.graph.dump.graph;
 
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
 import java.io.Serializable;
 import java.util.Optional;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.oa.graph.dump.Utils;
-import eu.dnetlib.dhp.schema.dump.oaf.Result;
-import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.SaveMode;

@@ -15,8 +14,10 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.dump.oaf.Result;
+import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
 
 public class SparkCollectAndSave implements Serializable {

@@ -55,20 +56,21 @@
 			});
 	}
 
 	private static void run(SparkSession spark, String inputPath, String outputPath) {
-		Utils.readPath(spark, inputPath + "/result/publication", Result.class)
+		Utils
+			.readPath(spark, inputPath + "/result/publication", Result.class)
 			.union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class))
-			.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct" , Result.class))
+			.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class))
 			.union(Utils.readPath(spark, inputPath + "/result/software", Result.class))
 			.write()
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(outputPath);
 
-		Utils.readPath(spark, inputPath +"/relation/publication", Relation.class)
+		Utils
+			.readPath(spark, inputPath + "/relation/publication", Relation.class)
 			.union(Utils.readPath(spark, inputPath + "/relation/dataset", Relation.class))
 			.union(Utils.readPath(spark, inputPath + "/relation/orp", Relation.class))
 			.union(Utils.readPath(spark, inputPath + "/relation/software", Relation.class))

View File: SparkExtractRelationFromEntities.java

@@ -1,9 +1,9 @@
 package eu.dnetlib.dhp.oa.graph.dump.graph;
 
 import java.io.Serializable;
 import java.util.*;
 
 import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -14,7 +14,6 @@ import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
 import eu.dnetlib.dhp.schema.oaf.Result;
-
 
 public class SparkExtractRelationFromEntities implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(SparkExtractRelationFromEntities.class);

@@ -55,8 +54,6 @@
 			Extractor extractor = new Extractor();
 			extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMap);
-
 	}
-
 }

View File: SparkOrganizationRelation.java

@@ -1,34 +1,7 @@
 package eu.dnetlib.dhp.oa.graph.dump.graph;
 
-import com.google.gson.Gson;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.oa.graph.dump.Utils;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
-import eu.dnetlib.dhp.schema.dump.oaf.graph.Node;
-import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import org.apache.avro.generic.GenericData;
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.ForeachFunction;
-import org.apache.spark.sql.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
 import java.io.BufferedWriter;
 import java.io.IOException;

@@ -38,7 +11,31 @@ import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicReference;
 
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import org.apache.avro.generic.GenericData;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.ForeachFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
+import eu.dnetlib.dhp.schema.dump.oaf.graph.Node;
+import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Relation;
 
 public class SparkOrganizationRelation implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(SparkOrganizationRelation.class);

@@ -65,8 +62,8 @@
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 
-		final OrganizationMap organizationMap = new Gson().fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
+		final OrganizationMap organizationMap = new Gson()
+			.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
 		log.info("organization map : {}", new Gson().toJson(organizationMap));
 
 		SparkConf conf = new SparkConf();

@@ -83,7 +80,8 @@
 	}
 
-	private static void writeRelations(SparkSession spark, Set<String> rels, String outputPath, OrganizationMap organizationMap) {
+	private static void writeRelations(SparkSession spark, Set<String> rels, String outputPath,
+		OrganizationMap organizationMap) {
 
 		List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList = new ArrayList<>();

@@ -107,15 +105,14 @@
 		});
 
-		spark.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
+		spark
+			.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath);
 	}
-
 	private static Set<String> extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap) {
 		Dataset<Relation> tmp = Utils.readPath(spark, inputPath, Relation.class);
 		Set<String> organizationSet = organizationMap.keySet();

@@ -125,7 +122,7 @@
 			Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
 			if (odInfo.isPresent()) {
 				if (!odInfo.get().getDeletedbyinference()) {
-					if(relation.getRelClass().equals(ModelConstants.MERGES)){
+					if (relation.getRelClass().equals(ModelConstants.MERGES)) {
 						String oId = relation.getTarget();
 						if (organizationSet.contains(oId)) {
 							organizationSet.remove(oId);

@@ -133,13 +130,12 @@
 					}
 				}
 			}
-		}});
+			}
+		});
 
 		toCreateRels.addAll(organizationSet);
 		return toCreateRels;
 	}
 
 }
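
The rewrapped Gson call deserializes the organization-to-community map directly from a CLI argument. A self-contained sketch (the OrganizationMap layout and the sample JSON are assumptions, not taken from this commit):

import java.util.HashMap;
import java.util.List;

import com.google.gson.Gson;

// Stand-in for the project's OrganizationMap: organization id -> community ids.
class OrganizationMap extends HashMap<String, List<String>> {
}

public class GsonMapSketch {
	public static void main(String[] args) {
		// Usage mirroring the reformatted call above; the JSON sample is made up.
		OrganizationMap organizationMap = new Gson()
			.fromJson("{\"20|openorgs____::1234\": [\"ee\", \"egi\"]}", OrganizationMap.class);
		System.out.println(organizationMap.keySet());
	}
}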