diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 1dc3208b5..b295bc1f1 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -92,6 +92,17 @@ com.squareup.okhttp3 okhttp + + + eu.dnetlib + dnet-pace-core + + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java index fc6b07c55..d66290a76 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java @@ -51,14 +51,12 @@ public class ZenodoAPIClient implements Serializable { /** * Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload - * * @return response code * @throws IOException */ public int newDeposition() throws IOException { String json = "{}"; - OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); - + OkHttpClient httpClient = new OkHttpClient(); RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, json); @@ -89,17 +87,17 @@ public class ZenodoAPIClient implements Serializable { /** * Upload files in Zenodo. - * * @param is the inputStream for the file to upload * @param file_name the name of the file as it will appear on Zenodo * @param len the size of the file * @return the response code */ public int uploadIS(InputStream is, String file_name, long len) throws IOException { - OkHttpClient httpClient = new OkHttpClient.Builder() - .writeTimeout(600, TimeUnit.SECONDS) + OkHttpClient httpClient = new OkHttpClient + .Builder() + .connectTimeout(600, TimeUnit.SECONDS) .readTimeout(600, TimeUnit.SECONDS) - .connectTimeout(600, TimeUnit.SECONDS).build(); + .writeTimeout(600, TimeUnit.SECONDS).build(); Request request = new Request.Builder() .url(bucket + "/" + file_name) @@ -117,14 +115,13 @@ public class ZenodoAPIClient implements Serializable { /** * Associates metadata information to the current deposition - * * @param metadata the metadata * @return response code * @throws IOException */ public int sendMretadata(String metadata) throws IOException { - OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + OkHttpClient httpClient = new OkHttpClient(); RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, metadata); @@ -148,7 +145,6 @@ public class ZenodoAPIClient implements Serializable { /** * To publish the current deposition. It works for both new deposition or new version of an old deposition - * * @return response code * @throws IOException */ @@ -156,7 +152,7 @@ public class ZenodoAPIClient implements Serializable { String json = "{}"; - OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + OkHttpClient httpClient = new OkHttpClient(); Request request = new Request.Builder() .url(urlString + "/" + deposition_id + "/actions/publish") @@ -175,12 +171,11 @@ public class ZenodoAPIClient implements Serializable { } /** - * To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used - * for the new version. - * - * @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last - * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930 - * concept_rec_id = 656930 + * To create a new version of an already published deposition. + * It sets the deposition_id and the bucket to be used for the new version. + * @param concept_rec_id the concept record id of the deposition for which to create a new version. It is + * the last part of the url for the DOI Zenodo suggests to use to cite all versions: + * DOI: 10.xxx/zenodo.656930 concept_rec_id = 656930 * @return response code * @throws IOException * @throws MissingConceptDoiException @@ -189,7 +184,7 @@ public class ZenodoAPIClient implements Serializable { setDepositionId(concept_rec_id); String json = "{}"; - OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + OkHttpClient httpClient = new OkHttpClient(); Request request = new Request.Builder() .url(urlString + "/" + deposition_id + "/actions/newversion") @@ -211,33 +206,6 @@ public class ZenodoAPIClient implements Serializable { } } - public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException { - - this.deposition_id = deposition_id; - - String json = "{}"; - - OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); - - Request request = new Request.Builder() - .url(urlString + "/" + deposition_id) - .addHeader("Authorization", "Bearer " + access_token) - // .post(RequestBody.create(MEDIA_TYPE_JSON, json)) - .build(); - - try (Response response = httpClient.newCall(request).execute()) { - - if (!response.isSuccessful()) - throw new IOException("Unexpected code " + response + response.body().string()); - - ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class); - bucket = zenodoModel.getLinks().getBucket(); - return response.code(); - - } - - } - private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException { ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class); @@ -254,7 +222,7 @@ public class ZenodoAPIClient implements Serializable { } private String getPrevDepositions() throws IOException { - OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + OkHttpClient httpClient = new OkHttpClient(); Request request = new Request.Builder() .url(urlString) @@ -275,8 +243,7 @@ public class ZenodoAPIClient implements Serializable { } private String getBucket(String url) throws IOException { - OkHttpClient httpClient = new OkHttpClient.Builder() - .connectTimeout(600, TimeUnit.SECONDS).build(); + OkHttpClient httpClient = new OkHttpClient(); Request request = new Request.Builder() .url(url) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java similarity index 92% rename from dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java index ee5fd5165..17482c019 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.dedup; +package eu.dnetlib.dhp.oa.merge; import java.text.Normalizer; import java.util.*; @@ -94,7 +94,13 @@ public class AuthorMerger { if (r.getPid() == null) { r.setPid(new ArrayList<>()); } - r.getPid().add(a._1()); + + // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList, + // it creates of fixed size, and the add method raise UnsupportedOperationException at + // java.util.AbstractList.add + final List tmp = new ArrayList<>(r.getPid()); + tmp.add(a._1()); + r.setPid(tmp); } } }); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java index 9fb60e145..1b5f3c40d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -5,6 +5,8 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; +import org.apache.commons.lang3.StringUtils; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; @@ -19,6 +21,8 @@ public class NormalizeDate extends AbstractExtensionFunction { private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'"; + public static final String BLANK = ""; + @Override public String getName() { return "normalizeDate"; @@ -27,10 +31,10 @@ public class NormalizeDate extends AbstractExtensionFunction { @Override public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { if (arguments == null | arguments.length == 0) { - return new StringValue(""); + return new StringValue(BLANK); } String s = arguments[0].head().getStringValue(); - return new StringValue(_year(s)); + return new StringValue(_normalizeDate(s)); } @Override @@ -55,8 +59,8 @@ public class NormalizeDate extends AbstractExtensionFunction { return SequenceType.SINGLE_STRING; } - private String _year(String s) { - final String date = s != null ? s.trim() : ""; + private String _normalizeDate(String s) { + final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK; for (String format : normalizeDateFormats) { try { @@ -66,6 +70,6 @@ public class NormalizeDate extends AbstractExtensionFunction { } catch (ParseException e) { } } - return ""; + return BLANK; } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/H2020Classification.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/H2020Classification.java new file mode 100644 index 000000000..4a61663b8 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/H2020Classification.java @@ -0,0 +1,82 @@ + +package eu.dnetlib.dhp.schema.dump.oaf.graph; + +import java.io.Serializable; + +/** + * To store information about the classification for the project. The classification depends on the programme. For example + * H2020-EU.3.4.5.3 can be classified as + * H2020-EU.3. => Societal Challenges (level1) + * H2020-EU.3.4. => Transport (level2) + * H2020-EU.3.4.5. => CLEANSKY2 (level3) + * H2020-EU.3.4.5.3. => IADP Fast Rotorcraft (level4) + * + * We decided to explicitly represent up to three levels in the classification. + * + * H2020Classification has the following parameters: + * - private Programme programme to store the information about the programme related to this classification + * - private String level1 to store the information about the level 1 of the classification (Priority or Pillar of the EC) + * - private String level2 to store the information about the level2 af the classification (Objectives (?)) + * - private String level3 to store the information about the level3 of the classification + * - private String classification to store the entire classification related to the programme + */ +public class H2020Classification implements Serializable { + private Programme programme; + + private String level1; + private String level2; + private String level3; + + private String classification; + + public Programme getProgramme() { + return programme; + } + + public void setProgramme(Programme programme) { + this.programme = programme; + } + + public String getLevel1() { + return level1; + } + + public void setLevel1(String level1) { + this.level1 = level1; + } + + public String getLevel2() { + return level2; + } + + public void setLevel2(String level2) { + this.level2 = level2; + } + + public String getLevel3() { + return level3; + } + + public void setLevel3(String level3) { + this.level3 = level3; + } + + public String getClassification() { + return classification; + } + + public void setClassification(String classification) { + this.classification = classification; + } + + public static H2020Classification newInstance(String programme_code, String programme_description, String level1, + String level2, String level3, String classification) { + H2020Classification h2020classification = new H2020Classification(); + h2020classification.programme = Programme.newInstance(programme_code, programme_description); + h2020classification.level1 = level1; + h2020classification.level2 = level2; + h2020classification.level3 = level3; + h2020classification.classification = classification; + return h2020classification; + } +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java index b9ab6959b..054e4d2df 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java @@ -4,32 +4,43 @@ package eu.dnetlib.dhp.schema.dump.oaf.graph; import java.io.Serializable; import java.util.List; -import eu.dnetlib.dhp.schema.dump.oaf.KeyValue; - /** * This is the class representing the Project in the model used for the dumps of the whole graph. At the moment the dump * of the Projects differs from the other dumps because we do not create relations between Funders (Organization) and - * Projects but we put the information about the Funder within the Project representation. We also removed the collected - * from element from the Project. No relation between the Project and the Datasource entity from which it is collected - * will be created. We will never create relations between Project and Datasource. In case some relation will be - * extracted from the Project they will refer the Funder and will be of type ( organization -> funds -> project, project - * -> isFundedBy -> organization) We also removed the duration parameter because the most of times it is set to 0. It - * extends eu.dnetlib.dhp.schema.dump.oaf.Project with the following parameters: - private String websiteurl to store - * the websiteurl of the project - private String startdate to store the start date - private String enddate to store - * the end date - private String callidentifier to store the call indentifier - private String keywords to store the - * keywords - private boolean openaccessmandateforpublications to store if the project must accomplish to the open - * access mandate for publications. This value will be set to true if one of the field in the project represented in the - * internal model is set to true - private boolean openaccessmandatefordataset to store if the project must accomplish - * to the open access mandate for dataset. It is set to the value in the corresponding filed of the project represented - * in the internal model - private List subject to store the list of subjects of the project - private - * List funding to store the list of funder of the project - private String summary to store the summary of the - * project - private Granted granted to store the granted amount - private List programme to store the list - * of programmes the project is related to + * Projects but we put the information about the Funder within the Project representation. We also removed the + * collected from element from the Project. No relation between the Project and the Datasource entity from which it is + * collected will be created. We will never create relations between Project and Datasource. In case some relation will + * be extracted from the Project they will refer the Funder and will be of type ( organization -> funds -> project, + * project -> isFundedBy -> organization) We also removed the duration parameter because the most of times it is set to + * 0. It has the following parameters: + * - private String id to store the id of the project (OpenAIRE id) + * - private String websiteurl to store the websiteurl of the project + * - private String code to store the grant agreement of the project + * - private String acronym to store the acronym of the project + * - private String title to store the tile of the project + * - private String startdate to store the start date + * - private String enddate to store the end date + * - private String callidentifier to store the call indentifier + * - private String keywords to store the keywords + * - private boolean openaccessmandateforpublications to store if the project must accomplish to the open access mandate + * for publications. This value will be set to true if one of the field in the project represented in the internal model + * is set to true + * - private boolean openaccessmandatefordataset to store if the project must accomplish to the open access mandate for + * dataset. It is set to the value in the corresponding filed of the project represented in the internal model + * - private List subject to store the list of subjects of the project + * - private List funding to store the list of funder of the project + * - private String summary to store the summary of the project + * - private Granted granted to store the granted amount + * - private List h2020classification to store the list of H2020 classifications the project is related to */ -public class Project extends eu.dnetlib.dhp.schema.dump.oaf.Project { +public class Project implements Serializable { + private String id; private String websiteurl; + private String code; + private String acronym; + private String title; private String startdate; private String enddate; @@ -49,7 +60,15 @@ public class Project extends eu.dnetlib.dhp.schema.dump.oaf.Project { private Granted granted; - private List programme; + private List h2020Classifications; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } public String getWebsiteurl() { return websiteurl; @@ -59,6 +78,30 @@ public class Project extends eu.dnetlib.dhp.schema.dump.oaf.Project { this.websiteurl = websiteurl; } + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getAcronym() { + return acronym; + } + + public void setAcronym(String acronym) { + this.acronym = acronym; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + public String getStartdate() { return startdate; } @@ -139,12 +182,11 @@ public class Project extends eu.dnetlib.dhp.schema.dump.oaf.Project { this.granted = granted; } - public List getProgramme() { - return programme; + public List getH2020Classifications() { + return h2020Classifications; } - public void setProgramme(List programme) { - this.programme = programme; + public void setH2020Classifications(List h2020Classifications) { + this.h2020Classifications = h2020Classifications; } - } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Classification.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Classification.java new file mode 100644 index 000000000..219bdc00d --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Classification.java @@ -0,0 +1,88 @@ + +package eu.dnetlib.dhp.schema.oaf; + +import java.io.Serializable; +import java.util.Objects; + +/** + * To store information about the classification for the project. The classification depends on the programme. For example + * H2020-EU.3.4.5.3 can be classified as + * H2020-EU.3. => Societal Challenges (level1) + * H2020-EU.3.4. => Transport (level2) + * H2020-EU.3.4.5. => CLEANSKY2 (level3) + * H2020-EU.3.4.5.3. => IADP Fast Rotorcraft (level4) + * + * We decided to explicitly represent up to three levels in the classification. + * + * H2020Classification has the following parameters: + * - private Programme programme to store the information about the programme related to this classification + * - private String level1 to store the information about the level 1 of the classification (Priority or Pillar of the EC) + * - private String level2 to store the information about the level2 af the classification (Objectives (?)) + * - private String level3 to store the information about the level3 of the classification + * - private String classification to store the entire classification related to the programme + */ + +public class H2020Classification implements Serializable { + private H2020Programme h2020Programme; + private String level1; + private String level2; + private String level3; + + private String classification; + + public H2020Programme getH2020Programme() { + return h2020Programme; + } + + public void setH2020Programme(H2020Programme h2020Programme) { + this.h2020Programme = h2020Programme; + } + + public String getLevel1() { + return level1; + } + + public void setLevel1(String level1) { + this.level1 = level1; + } + + public String getLevel2() { + return level2; + } + + public void setLevel2(String level2) { + this.level2 = level2; + } + + public String getLevel3() { + return level3; + } + + public void setLevel3(String level3) { + this.level3 = level3; + } + + public String getClassification() { + return classification; + } + + public void setClassification(String classification) { + this.classification = classification; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + + H2020Classification h2020classification = (H2020Classification) o; + + return Objects.equals(level1, h2020classification.level1) && + Objects.equals(level2, h2020classification.level2) && + Objects.equals(level3, h2020classification.level3) && + Objects.equals(classification, h2020classification.classification) && + h2020Programme.equals(h2020classification.h2020Programme); + } +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Programme.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Programme.java similarity index 57% rename from dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Programme.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Programme.java index 00dc32fbc..101d46d35 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Programme.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Programme.java @@ -4,7 +4,13 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.Objects; -public class Programme implements Serializable { +/** + * To store information about the ec programme for the project. It has the following parameters: + * - private String code to store the code of the programme + * - private String description to store the description of the programme + */ + +public class H2020Programme implements Serializable { private String code; private String description; @@ -31,8 +37,8 @@ public class Programme implements Serializable { if (o == null || getClass() != o.getClass()) return false; - Programme programme = (Programme) o; - return Objects.equals(code, programme.code); + H2020Programme h2020Programme = (H2020Programme) o; + return Objects.equals(code, h2020Programme.code); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java index 1fcfb305e..b698c957d 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java @@ -58,7 +58,35 @@ public class Project extends OafEntity implements Serializable { private Float fundedamount; - private List programme; + private String h2020topiccode; + + private String h2020topicdescription; + + private List h2020classification; + + public String getH2020topicdescription() { + return h2020topicdescription; + } + + public void setH2020topicdescription(String h2020topicdescription) { + this.h2020topicdescription = h2020topicdescription; + } + + public String getH2020topiccode() { + return h2020topiccode; + } + + public void setH2020topiccode(String h2020topiccode) { + this.h2020topiccode = h2020topiccode; + } + + public List getH2020classification() { + return h2020classification; + } + + public void setH2020classification(List h2020classification) { + this.h2020classification = h2020classification; + } public Field getWebsiteurl() { return websiteurl; @@ -268,14 +296,6 @@ public class Project extends OafEntity implements Serializable { this.fundedamount = fundedamount; } - public List getProgramme() { - return programme; - } - - public void setProgramme(List programme) { - this.programme = programme; - } - @Override public void mergeFrom(OafEntity e) { super.mergeFrom(e); @@ -331,7 +351,9 @@ public class Project extends OafEntity implements Serializable { ? p.getFundedamount() : fundedamount; - programme = mergeLists(programme, p.getProgramme()); + // programme = mergeLists(programme, p.getProgramme()); + + h2020classification = mergeLists(h2020classification, p.getH2020classification()); mergeOAFDataInfo(e); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index fdd42ab7d..443c18230 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -7,6 +7,8 @@ import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.common.LicenseComparator; + public class Result extends OafEntity implements Serializable { private List measures; @@ -245,7 +247,8 @@ public class Result extends OafEntity implements Serializable { instance = mergeLists(instance, r.getInstance()); - if (r.getBestaccessright() != null && compareTrust(this, r) < 0) + if (r.getBestaccessright() != null + && new LicenseComparator().compare(r.getBestaccessright(), bestaccessright) < 0) bestaccessright = r.getBestaccessright(); if (r.getResulttype() != null && compareTrust(this, r) < 0) diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index a1bc1c483..cf0fa0efe 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -66,6 +66,19 @@ + + + org.apache.poi + poi-ooxml + + + + + org.apache.commons + commons-compress + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java index c6dab13a0..7f0ca983f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java @@ -4,11 +4,13 @@ package eu.dnetlib.dhp.actionmanager.project; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.HashMap; +import java.util.Map; import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.slf4j.Logger; @@ -16,11 +18,79 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import scala.Tuple2; +/** + * Among all the programmes provided in the csv file, selects those in H2020 framework that have an english title. + * + * The title is then handled to get the programme description at a certain level. The set of programme titles will then + * be used to associate a classification for the programme. + * + * The programme code describes an hierarchy that can be exploited to provide the classification. To determine the hierarchy + * the code can be split by '.'. If the length of the splitted code is less than or equal to 2 it can be directly used + * as the classification: H2020-EU -> Horizon 2020 Framework Programme (It will never be repeated), + * H2020-EU.1. -> Excellent science, H2020-EU.2. -> Industrial leadership etc. + * + * The codes are ordered and for all of them the concatenation of all the titles (from the element in position 1 of + * the splitted code) handled as below is used to create the classification. For example: + * + * H2020-EU.1.1 -> Excellent science | European Research Council (ERC) + * from H2020-EU.1. -> Excellence science and H2020-EU.1.1. -> European Research Council (ERC) + * + * H2020-EU.3.1.3.1. -> Societal challenges | Health, demographic change and well-being | Treating and managing disease | Treating disease, including developing regenerative medicine + * from H2020-EU.3. -> Societal challenges, + * H2020-EU.3.1. -> Health, demographic change and well-being + * H2020-EU.3.1.3 -> Treating and managing disease + * H2020-EU.3.1.3.1. -> Treating disease, including developing regenerative medicine + * + * The classification up to level three, will be split in dedicated variables, while the complete classification will be stored + * in a variable called classification and provided as shown above. + * + * The programme title is not give in a standardized way: + * + * - Sometimes associated to the higher level in the hierarchy we can find Priority in title other times it is not the + * case. Since it is not uniform, we removed priority from the handled titles: + * + * H2020-EU.1. -> PRIORITY 'Excellent science' + * H2020-EU.2. -> PRIORITY 'Industrial leadership' + * H2020-EU.3. -> PRIORITY 'Societal challenges + * + * will become + * + * H2020-EU.1. -> Excellent science + * H2020-EU.2. -> Industrial leadership + * H2020-EU.3. -> Societal challenges + * + * - Sometimes the title of the parent is repeated in the title for the code, but it is not always the case, so, titles + * associated to previous levels in the hierarchy are removed from the code title. + * + * H2020-EU.1.2. -> EXCELLENT SCIENCE - Future and Emerging Technologies (FET) + * H2020-EU.2.2. -> INDUSTRIAL LEADERSHIP - Access to risk finance + * H2020-EU.3.4. -> SOCIETAL CHALLENGES - Smart, Green And Integrated Transport + * + * will become + * + * H2020-EU.1.2. -> Future and Emerging Technologies (FET) + * H2020-EU.2.2. -> Access to risk finance + * H2020-EU.3.4. -> Smart, Green And Integrated Transport + * + * This holds at all levels in the hierarchy. Hence + * + * H2020-EU.2.1.2. -> INDUSTRIAL LEADERSHIP - Leadership in enabling and industrial technologies – Nanotechnologies + * + * will become + * + * H2020-EU.2.1.2. -> Nanotechnologies + * + * - Euratom is not given in the way the other programmes are: H2020-EU. but H2020-Euratom- . So we need to write + * specific code for it + * + * + * + */ public class PrepareProgramme { private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); @@ -69,49 +139,127 @@ public class PrepareProgramme { private static void exec(SparkSession spark, String programmePath, String outputPath) { Dataset programme = readPath(spark, programmePath, CSVProgramme.class); - programme + JavaRDD h2020Programmes = programme .toJavaRDD() - .filter(p -> !p.getCode().contains("FP7")) + .filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020")) .mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme)) .reduceByKey((a, b) -> { - if (StringUtils.isEmpty(a.getShortTitle())) { - if (StringUtils.isEmpty(b.getShortTitle())) { - if (StringUtils.isEmpty(a.getTitle())) { - if (StringUtils.isNotEmpty(b.getTitle())) { - a.setShortTitle(b.getTitle()); - a.setLanguage(b.getLanguage()); - } - } else {// notIsEmpty a.getTitle - if (StringUtils.isEmpty(b.getTitle())) { - a.setShortTitle(a.getTitle()); - } else { - if (b.getLanguage().equalsIgnoreCase("en")) { - a.setShortTitle(b.getTitle()); - a.setLanguage(b.getLanguage()); - } else { - a.setShortTitle(a.getTitle()); - } - } - } - } else {// not isEmpty b.getShortTitle - a.setShortTitle(b.getShortTitle()); - // a.setLanguage(b.getLanguage()); + if (!a.getLanguage().equals("en")) { + if (b.getLanguage().equalsIgnoreCase("en")) { + a.setTitle(b.getTitle()); + a.setLanguage(b.getLanguage()); } } + if (StringUtils.isEmpty(a.getShortTitle())) { + if (!StringUtils.isEmpty(b.getShortTitle())) { + a.setShortTitle(b.getShortTitle()); + } + } + return a; }) .map(p -> { CSVProgramme csvProgramme = p._2(); - if (StringUtils.isEmpty(csvProgramme.getShortTitle())) { - csvProgramme.setShortTitle(csvProgramme.getTitle()); + String programmeTitle = csvProgramme.getTitle().trim(); + if (programmeTitle.length() > 8 && programmeTitle.substring(0, 8).equalsIgnoreCase("PRIORITY")) { + programmeTitle = programmeTitle.substring(9); + if (programmeTitle.charAt(0) == '\'') { + programmeTitle = programmeTitle.substring(1); + } + if (programmeTitle.charAt(programmeTitle.length() - 1) == '\'') { + programmeTitle = programmeTitle.substring(0, programmeTitle.length() - 1); + } + csvProgramme.setTitle(programmeTitle); } - return OBJECT_MAPPER.writeValueAsString(csvProgramme); - }) + return csvProgramme; + }); + + prepareClassification(h2020Programmes); + + h2020Programmes + .map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme)) .saveAsTextFile(outputPath); } + private static void prepareClassification(JavaRDD h2020Programmes) { + Object[] codedescription = h2020Programmes + .map(value -> new Tuple2<>(value.getCode(), value.getTitle())) + .collect() + .toArray(); + + for (int i = 0; i < codedescription.length - 1; i++) { + for (int j = i + 1; j < codedescription.length; j++) { + Tuple2 t2i = (Tuple2) codedescription[i]; + Tuple2 t2j = (Tuple2) codedescription[j]; + if (t2i._1().compareTo(t2j._1()) > 0) { + Tuple2 temp = t2i; + codedescription[i] = t2j; + codedescription[j] = temp; + } + } + } + + Map map = new HashMap<>(); + for (int j = 0; j < codedescription.length; j++) { + Tuple2 entry = (Tuple2) codedescription[j]; + String ent = entry._1(); + if (ent.contains("Euratom-")) { + ent = ent.replace("-Euratom-", ".Euratom."); + } + String[] tmp = ent.split("\\."); + if (tmp.length <= 2) { + map.put(entry._1(), entry._2()); + + } else { + if (ent.endsWith(".")) { + ent = ent.substring(0, ent.length() - 1); + } + String key = ent.substring(0, ent.lastIndexOf(".") + 1); + if (key.contains("Euratom")) { + key = key.replace(".Euratom.", "-Euratom-"); + ent = ent.replace(".Euratom.", "-Euratom-"); + if (key.endsWith("-")) { + key = key.substring(0, key.length() - 1); + } + } + String current = entry._2(); + if (!ent.contains("Euratom")) { + + String parent; + String tmp_key = tmp[0] + "."; + for (int i = 1; i < tmp.length - 1; i++) { + tmp_key += tmp[i] + "."; + parent = map.get(tmp_key).toLowerCase().trim(); + if (parent.contains("|")) { + parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); + } + if (current.trim().length() > parent.length() + && current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) { + current = current.substring(parent.length() + 1); + if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') { + current = current.trim().substring(1).trim(); + } + + } + } + + } + map.put(ent + ".", map.get(key) + " | " + current); + + } + + } + h2020Programmes.foreach(csvProgramme -> { + if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom") + && !csvProgramme.getCode().equals("H2020-EC")) + csvProgramme.setClassification(map.get(csvProgramme.getCode() + ".")); + else + csvProgramme.setClassification(map.get(csvProgramme.getCode())); + }); + } + public static Dataset readPath( SparkSession spark, String inputPath, Class clazz) { return spark diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java index 78aed1a69..e5cae0ff7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java @@ -6,9 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.*; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -20,12 +18,16 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProject; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import scala.Tuple2; +/** + * Selects only the relevant information collected with the projects: project grant agreement, project programme code and + * project topic code for the projects that are also collected from OpenAIRE. + */ public class PrepareProjects { private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); @@ -97,10 +99,14 @@ public class PrepareProjects { if (csvProject.isPresent()) { String[] programme = csvProject.get().getProgramme().split(";"); + String topic = csvProject.get().getTopics(); + Arrays .stream(programme) .forEach(p -> { CSVProject proj = new CSVProject(); + proj.setTopics(topic); + proj.setProgramme(p); proj.setId(csvProject.get().getId()); csvProjectList.add(proj); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java index 2fccbc516..06f8c2fef 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java @@ -3,6 +3,9 @@ package eu.dnetlib.dhp.actionmanager.project; import java.io.Serializable; +/** + * Class to store the grande agreement (code) of the collected projects + */ public class ProjectSubset implements Serializable { private String code; @@ -14,4 +17,5 @@ public class ProjectSubset implements Serializable { public void setCode(String code) { this.code = code; } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java index 2d541d2f9..2bba9fb60 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java @@ -25,6 +25,10 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; +/** + * queries the OpenAIRE database to get the grant agreement of projects collected from corda__h2020. The code collected + * are written on hdfs using the ProjectSubset model + */ public class ReadProjectsFromDB implements Closeable { private final DbClient dbClient; @@ -33,7 +37,7 @@ public class ReadProjectsFromDB implements Closeable { private final BufferedWriter writer; private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private final static String query = "SELECT code " + + private final static String query = "SELECT code " + "from projects where id like 'corda__h2020%' "; public static void main(final String[] args) throws Exception { @@ -72,7 +76,6 @@ public class ReadProjectsFromDB implements Closeable { try { ProjectSubset p = new ProjectSubset(); p.setCode(rs.getString("code")); - return Arrays.asList(p); } catch (final Exception e) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java index 1023e2d19..f2375e799 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java @@ -3,47 +3,54 @@ package eu.dnetlib.dhp.actionmanager.project; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.Objects; import java.util.Optional; -import java.util.function.Consumer; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; -import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; -import org.apache.spark.rdd.SequenceFileRDDFunctions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProject; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; +import eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Programme; +import eu.dnetlib.dhp.schema.oaf.H2020Classification; +import eu.dnetlib.dhp.schema.oaf.H2020Programme; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.utils.DHPUtils; -import scala.Function1; import scala.Tuple2; -import scala.runtime.BoxedUnit; +/** + * Class that makes the ActionSet. To prepare the AS two joins are needed + * + * 1. join betweem the collected project subset and the programme extenden with the classification on the grant agreement. + * For each entry a + * eu.dnetlib.dhp.Project entity is created and the information about H2020Classification is set together with the + * h2020topiccode variable + * 2. join between the output of the previous step and the topic information on the topic code. Each time a match is + * found the h2020topicdescription variable is set. + * + * To produce one single entry for each project code a step of groupoing is needed: each project can be associated to more + * than one programme. + * + * + */ public class SparkAtomicActionJob { private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -77,6 +84,9 @@ public class SparkAtomicActionJob { final String programmePath = parser.get("programmePath"); log.info("programmePath {}: ", programmePath); + final String topicPath = parser.get("topicPath"); + log.info("topic path {}: ", topicPath); + SparkConf conf = new SparkConf(); runWithSparkSession( @@ -88,6 +98,7 @@ public class SparkAtomicActionJob { spark, projectPath, programmePath, + topicPath, outputPath); }); } @@ -98,31 +109,53 @@ public class SparkAtomicActionJob { private static void getAtomicActions(SparkSession spark, String projectPatH, String programmePath, + String topicPath, String outputPath) { Dataset project = readPath(spark, projectPatH, CSVProject.class); Dataset programme = readPath(spark, programmePath, CSVProgramme.class); + Dataset topic = readPath(spark, topicPath, EXCELTopic.class); - project + Dataset aaproject = project .joinWith(programme, project.col("programme").equalTo(programme.col("code")), "left") - .map(c -> { - CSVProject csvProject = c._1(); - Optional csvProgramme = Optional.ofNullable(c._2()); - if (csvProgramme.isPresent()) { - Project p = new Project(); - p - .setId( - createOpenaireId( - ModelSupport.entityIdPrefix.get("project"), - "corda__h2020", csvProject.getId())); - Programme pm = new Programme(); - pm.setCode(csvProject.getProgramme()); - pm.setDescription(csvProgramme.get().getShortTitle()); - p.setProgramme(Arrays.asList(pm)); - return p; - } + .map((MapFunction, Project>) c -> { - return null; + CSVProject csvProject = c._1(); + Optional ocsvProgramme = Optional.ofNullable(c._2()); + + return Optional + .ofNullable(c._2()) + .map(csvProgramme -> { + Project pp = new Project(); + pp + .setId( + createOpenaireId( + ModelSupport.entityIdPrefix.get("project"), + "corda__h2020", csvProject.getId())); + pp.setH2020topiccode(csvProject.getTopics()); + H2020Programme pm = new H2020Programme(); + H2020Classification h2020classification = new H2020Classification(); + pm.setCode(csvProject.getProgramme()); + h2020classification.setClassification(ocsvProgramme.get().getClassification()); + h2020classification.setH2020Programme(pm); + setLevelsAndProgramme(h2020classification, ocsvProgramme.get().getClassification()); + pp.setH2020classification(Arrays.asList(h2020classification)); + + return pp; + }) + .orElse(null); + + }, Encoders.bean(Project.class)); + + aaproject + .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code"))) + .map((MapFunction, Project>) p -> { + Optional op = Optional.ofNullable(p._2()); + Project rp = p._1(); + if (op.isPresent()) { + rp.setH2020topicdescription(op.get().getTitle()); + } + return rp; }, Encoders.bean(Project.class)) .filter(Objects::nonNull) .groupByKey( @@ -144,6 +177,18 @@ public class SparkAtomicActionJob { } + private static void setLevelsAndProgramme(H2020Classification h2020Classification, String classification) { + String[] tmp = classification.split(" \\| "); + h2020Classification.setLevel1(tmp[0]); + if (tmp.length > 1) { + h2020Classification.setLevel2(tmp[1]); + } + if (tmp.length > 2) { + h2020Classification.setLevel3(tmp[2]); + } + h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); + } + public static Dataset readPath( SparkSession spark, String inputPath, Class clazz) { return spark diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProgramme.java deleted file mode 100644 index a9069e510..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProgramme.java +++ /dev/null @@ -1,52 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.csvutils; - -import java.io.Serializable; - -public class CSVProgramme implements Serializable { - private String rcn; - private String code; - private String title; - private String shortTitle; - private String language; - - public String getRcn() { - return rcn; - } - - public void setRcn(String rcn) { - this.rcn = rcn; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public String getShortTitle() { - return shortTitle; - } - - public void setShortTitle(String shortTitle) { - this.shortTitle = shortTitle; - } - - public String getLanguage() { - return language; - } - - public void setLanguage(String language) { - this.language = language; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java similarity index 88% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVParser.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java index ef29a6b6a..8bdce903b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.actionmanager.project.csvutils; +package eu.dnetlib.dhp.actionmanager.project.utils; import java.io.IOException; import java.util.ArrayList; @@ -10,6 +10,9 @@ import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVRecord; import org.apache.commons.lang.reflect.FieldUtils; +/** + * Reads a generic csv and maps it into classes that mirror its schema + */ public class CSVParser { public List parse(String csvFile, String classForName) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java new file mode 100644 index 000000000..6f9a59087 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java @@ -0,0 +1,137 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.Serializable; + +/** + * The model for the programme csv file + */ +public class CSVProgramme implements Serializable { + private String parentProgramme; + private String frameworkProgramme; + private String startDate; + private String endDate; + private String objective; + private String subjects; + private String legalBasis; + private String call; + private String rcn; + private String code; + + private String title; + private String shortTitle; + private String language; + private String classification; + + public String getClassification() { + return classification; + } + + public void setClassification(String classification) { + this.classification = classification; + } + + public String getRcn() { + return rcn; + } + + public void setRcn(String rcn) { + this.rcn = rcn; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getShortTitle() { + return shortTitle; + } + + public void setShortTitle(String shortTitle) { + this.shortTitle = shortTitle; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getParentProgramme() { + return parentProgramme; + } + + public void setParentProgramme(String parentProgramme) { + this.parentProgramme = parentProgramme; + } + + public String getFrameworkProgramme() { + return frameworkProgramme; + } + + public void setFrameworkProgramme(String frameworkProgramme) { + this.frameworkProgramme = frameworkProgramme; + } + + public String getStartDate() { + return startDate; + } + + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + public String getEndDate() { + return endDate; + } + + public void setEndDate(String endDate) { + this.endDate = endDate; + } + + public String getObjective() { + return objective; + } + + public void setObjective(String objective) { + this.objective = objective; + } + + public String getSubjects() { + return subjects; + } + + public void setSubjects(String subjects) { + this.subjects = subjects; + } + + public String getLegalBasis() { + return legalBasis; + } + + public void setLegalBasis(String legalBasis) { + this.legalBasis = legalBasis; + } + + public String getCall() { + return call; + } + + public void setCall(String call) { + this.call = call; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java similarity index 97% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProject.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java index ff18c6260..268d5f28c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProject.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java @@ -1,8 +1,11 @@ -package eu.dnetlib.dhp.actionmanager.project.csvutils; +package eu.dnetlib.dhp.actionmanager.project.utils; import java.io.Serializable; +/** + * the mmodel for the projects csv file + */ public class CSVProject implements Serializable { private String rcn; private String id; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java new file mode 100644 index 000000000..0f83499e4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -0,0 +1,75 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang.reflect.FieldUtils; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +/** + * Reads a generic excel file and maps it into classes that mirror its schema + */ +public class EXCELParser { + + public List parse(InputStream file, String classForName) + throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, + InvalidFormatException { + + // OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL)); + OPCPackage pkg = OPCPackage.open(file); + XSSFWorkbook wb = new XSSFWorkbook(pkg); + + XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); + + List ret = new ArrayList<>(); + + DataFormatter dataFormatter = new DataFormatter(); + Iterator rowIterator = sheet.rowIterator(); + List headers = new ArrayList<>(); + int count = 0; + while (rowIterator.hasNext()) { + Row row = rowIterator.next(); + + if (count == 0) { + Iterator cellIterator = row.cellIterator(); + + while (cellIterator.hasNext()) { + Cell cell = cellIterator.next(); + headers.add(dataFormatter.formatCellValue(cell)); + } + } else { + Class clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); + final Object cc = clazz.newInstance(); + + for (int i = 0; i < headers.size(); i++) { + Cell cell = row.getCell(i); + String value = dataFormatter.formatCellValue(cell); + FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); + + } + + EXCELTopic et = (EXCELTopic) cc; + if (StringUtils.isNotBlank(et.getRcn())) { + ret.add((R) cc); + } + + } + + count += 1; + } + + return ret; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java new file mode 100644 index 000000000..5607df118 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java @@ -0,0 +1,127 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.Serializable; + +/** + * the model class for the topic excel file + */ +public class EXCELTopic implements Serializable { + private String rcn; + private String language; + private String code; + private String parentProgramme; + private String frameworkProgramme; + private String startDate; + private String endDate; + private String title; + private String shortTitle; + private String objective; + private String subjects; + private String legalBasis; + private String call; + + public String getRcn() { + return rcn; + } + + public void setRcn(String rcn) { + this.rcn = rcn; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getParentProgramme() { + return parentProgramme; + } + + public void setParentProgramme(String parentProgramme) { + this.parentProgramme = parentProgramme; + } + + public String getFrameworkProgramme() { + return frameworkProgramme; + } + + public void setFrameworkProgramme(String frameworkProgramme) { + this.frameworkProgramme = frameworkProgramme; + } + + public String getStartDate() { + return startDate; + } + + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + public String getEndDate() { + return endDate; + } + + public void setEndDate(String endDate) { + this.endDate = endDate; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getShortTitle() { + return shortTitle; + } + + public void setShortTitle(String shortTitle) { + this.shortTitle = shortTitle; + } + + public String getObjective() { + return objective; + } + + public void setObjective(String objective) { + this.objective = objective; + } + + public String getSubjects() { + return subjects; + } + + public void setSubjects(String subjects) { + this.subjects = subjects; + } + + public String getLegalBasis() { + return legalBasis; + } + + public void setLegalBasis(String legalBasis) { + this.legalBasis = legalBasis; + } + + public String getCall() { + return call; + } + + public void setCall(String call) { + this.call = call; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java similarity index 95% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/ReadCSV.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index 2b72b229a..9dac34a15 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.actionmanager.project.csvutils; +package eu.dnetlib.dhp.actionmanager.project.utils; import java.io.BufferedWriter; import java.io.Closeable; @@ -20,6 +20,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +/** + * Applies the parsing of a csv file and writes the Serialization of it in hdfs + */ public class ReadCSV implements Closeable { private static final Log log = LogFactory.getLog(ReadCSV.class); private final Configuration conf; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java new file mode 100644 index 000000000..23b58f2a0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -0,0 +1,98 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.*; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * Applies the parsing of an excel file and writes the Serialization of it in hdfs + */ + +public class ReadExcel implements Closeable { + private static final Log log = LogFactory.getLog(ReadCSV.class); + private final Configuration conf; + private final BufferedWriter writer; + private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private InputStream excelFile; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ReadCSV.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/project/parameters.json"))); + + parser.parseArgument(args); + + final String fileURL = parser.get("fileURL"); + final String hdfsPath = parser.get("hdfsPath"); + final String hdfsNameNode = parser.get("hdfsNameNode"); + final String classForName = parser.get("classForName"); + + try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) { + + log.info("Getting Excel file..."); + readExcel.execute(classForName); + + } + } + + public void execute(final String classForName) throws Exception { + EXCELParser excelParser = new EXCELParser(); + excelParser + .parse(excelFile, classForName) + .stream() + .forEach(p -> write(p)); + + } + + @Override + public void close() throws IOException { + writer.close(); + } + + public ReadExcel( + final String hdfsPath, + final String hdfsNameNode, + final String fileURL) + throws Exception { + this.conf = new Configuration(); + this.conf.set("fs.defaultFS", hdfsNameNode); + HttpConnector httpConnector = new HttpConnector(); + FileSystem fileSystem = FileSystem.get(this.conf); + Path hdfsWritePath = new Path(hdfsPath); + FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, false); + } + fsDataOutputStream = fileSystem.create(hdfsWritePath); + + this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + this.excelFile = httpConnector.getInputSourceAsStream(fileURL); + ; + } + + protected void write(final Object p) { + try { + writer.write(OBJECT_MAPPER.writeValueAsString(p)); + writer.newLine(); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/action_set_parameters.json index a0856e10e..180142044 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/action_set_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/action_set_parameters.json @@ -17,6 +17,12 @@ "paramDescription": "the URL from where to get the programme file", "paramRequired": true }, + { + "paramName": "tp", + "paramLongName": "topicPath", + "paramDescription": "the URL from where to get the topic file", + "paramRequired": true + }, { "paramName": "o", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/config-default.xml index fe82ae194..a1755f329 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/config-default.xml @@ -31,6 +31,10 @@ spark2SqlQueryExecutionListeners com.cloudera.spark.lineage.NavigatorQueryListener + + oozie.launcher.mapreduce.user.classpath.first + true + sparkExecutorNumber 4 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml index 1e3445675..c710c8b55 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml @@ -10,6 +10,10 @@ the url where to get the programme file + + topicFileURL + the url where to get the topic file + outputPath path where to store the action set @@ -33,11 +37,11 @@ - eu.dnetlib.dhp.actionmanager.project.csvutils.ReadCSV + eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV --hdfsNameNode${nameNode} --fileURL${projectFileURL} --hdfsPath${workingDir}/projects - --classForNameeu.dnetlib.dhp.actionmanager.project.csvutils.CSVProject + --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProject @@ -45,11 +49,23 @@ - eu.dnetlib.dhp.actionmanager.project.csvutils.ReadCSV + eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV --hdfsNameNode${nameNode} --fileURL${programmeFileURL} --hdfsPath${workingDir}/programme - --classForNameeu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme + --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme + + + + + + + + eu.dnetlib.dhp.actionmanager.project.utils.ReadExcel + --hdfsNameNode${nameNode} + --fileURL${topicFileURL} + --hdfsPath${workingDir}/topic + --classForNameeu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic @@ -136,6 +152,7 @@ --projectPath${workingDir}/preparedProjects --programmePath${workingDir}/preparedProgramme + --topicPath${workingDir}/topic --outputPath${outputPath} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java index 17fdd4511..da5beecf9 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java @@ -1,27 +1,16 @@ package eu.dnetlib.dhp.actionmanager.project; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.List; import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVParser; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVParser; public class CSVParserTest { - private static Path workingDir; - - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(CSVParserTest.class.getSimpleName()); - - } - @Test public void readProgrammeTest() throws Exception { @@ -33,9 +22,10 @@ public class CSVParserTest { CSVParser csvParser = new CSVParser(); - List pl = csvParser.parse(programmecsv, "eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme"); + List pl = csvParser.parse(programmecsv, "eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme"); - System.out.println(pl.size()); + Assertions.assertEquals(24, pl.size()); } + } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java new file mode 100644 index 000000000..fdc577687 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.actionmanager.project; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; +import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; +import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; + +public class EXCELParserTest { + + private static Path workingDir; + private HttpConnector httpConnector = new HttpConnector(); + private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(CSVParserTest.class.getSimpleName()); + + } + + @Test + public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException, + IllegalAccessException, InstantiationException { + + EXCELParser excelParser = new EXCELParser(); + + List pl = excelParser + .parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic"); + + Assertions.assertEquals(3837, pl.size()); + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgrammeTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java similarity index 53% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgrammeTest.java rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java index 7f890a8a3..c5801bccc 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgrammeTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java @@ -21,29 +21,29 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; -public class PrepareProgrammeTest { +public class PrepareH2020ProgrammeTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = eu.dnetlib.dhp.actionmanager.project.PrepareProgrammeTest.class + private static final ClassLoader cl = PrepareH2020ProgrammeTest.class .getClassLoader(); private static SparkSession spark; private static Path workingDir; private static final Logger log = LoggerFactory - .getLogger(eu.dnetlib.dhp.actionmanager.project.PrepareProgrammeTest.class); + .getLogger(PrepareH2020ProgrammeTest.class); @BeforeAll public static void beforeAll() throws IOException { workingDir = Files - .createTempDirectory(eu.dnetlib.dhp.actionmanager.project.PrepareProgrammeTest.class.getSimpleName()); + .createTempDirectory(PrepareH2020ProgrammeTest.class.getSimpleName()); log.info("using work dir {}", workingDir); SparkConf conf = new SparkConf(); - conf.setAppName(eu.dnetlib.dhp.actionmanager.project.PrepareProgrammeTest.class.getSimpleName()); + conf.setAppName(PrepareH2020ProgrammeTest.class.getSimpleName()); conf.setMaster("local[*]"); conf.set("spark.driver.host", "localhost"); @@ -54,7 +54,7 @@ public class PrepareProgrammeTest { spark = SparkSession .builder() - .appName(PrepareProgrammeTest.class.getSimpleName()) + .appName(PrepareH2020ProgrammeTest.class.getSimpleName()) .config(conf) .getOrCreate(); } @@ -88,7 +88,60 @@ public class PrepareProgrammeTest { Dataset verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(CSVProgramme.class)); - Assertions.assertEquals(0, verificationDataset.filter("shortTitle =''").count()); + Assertions.assertEquals(0, verificationDataset.filter("title =''").count()); + + Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count()); + + Assertions + .assertEquals( + "Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft", + verificationDataset + .filter("code = 'H2020-EU.3.4.5.3.'") + .select("classification") + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + "Euratom | Indirect actions | European Fusion Development Agreement", + verificationDataset + .filter("code = 'H2020-Euratom-1.9.'") + .select("classification") + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + "Industrial leadership | Leadership in enabling and industrial technologies | Advanced manufacturing and processing | New sustainable business models", + verificationDataset + .filter("code = 'H2020-EU.2.1.5.4.'") + .select("classification") + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + "Excellent science | Future and Emerging Technologies (FET) | FET Open", + verificationDataset + .filter("code = 'H2020-EU.1.2.1.'") + .select("classification") + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + "Industrial leadership | Leadership in enabling and industrial technologies | Biotechnology", + verificationDataset + .filter("code = 'H2020-EU.2.1.4.'") + .select("classification") + .collectAsList() + .get(0) + .getString(0)); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjectTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjectTest.java index 5ff88e46f..0db3485f5 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjectTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjectTest.java @@ -21,8 +21,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProgramme; -import eu.dnetlib.dhp.actionmanager.project.csvutils.CSVProject; +import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; public class PrepareProjectTest { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java index 718cd8ebe..cfda7e718 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java @@ -4,12 +4,16 @@ package eu.dnetlib.dhp.actionmanager.project; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; @@ -73,10 +77,13 @@ public class SparkUpdateProjectTest { Boolean.FALSE.toString(), "-programmePath", getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz") + .getResource( + "/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz") .getPath(), "-projectPath", getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(), + "-topicPath", + getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/topic.json.gz").getPath(), "-outputPath", workingDir.toString() + "/actionSet" }); @@ -88,7 +95,231 @@ public class SparkUpdateProjectTest { .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Project) aa.getPayload())); - Assertions.assertEquals(14, tmp.count()); + Assertions.assertEquals(15, tmp.count()); + + Dataset verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Project.class)); + verificationDataset.createOrReplaceTempView("project"); + + Dataset execverification = spark + .sql( + "SELECT id, class classification, h2020topiccode, h2020topicdescription FROM project LATERAL VIEW EXPLODE(h2020classification) c as class "); + + Assertions + .assertEquals( + "H2020-EU.3.4.7.", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("classification.h2020Programme.code") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "SESAR JU", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("classification.h2020Programme.description") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Societal challenges", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("classification.level1") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Smart, Green And Integrated Transport", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("classification.level2") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "SESAR JU", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("classification.level3") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Societal challenges | Smart, Green And Integrated Transport | SESAR JU", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("classification.classification") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "SESAR-ER4-31-2019", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("h2020topiccode") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "U-space", + execverification + .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") + .select("h2020topicdescription") + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + "H2020-EU.1.3.2.", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("classification.h2020Programme.code") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Nurturing excellence by means of cross-border and cross-sector mobility", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("classification.h2020Programme.description") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Excellent science", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("classification.level1") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Marie Skłodowska-Curie Actions", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("classification.level2") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Nurturing excellence by means of cross-border and cross-sector mobility", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("classification.level3") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Excellent science | Marie Skłodowska-Curie Actions | Nurturing excellence by means of cross-border and cross-sector mobility", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("classification.classification") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "MSCA-IF-2019", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("h2020topiccode") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Individual Fellowships", + execverification + .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") + .select("h2020topicdescription") + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertTrue( + execverification + .filter("id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5'") + .select("classification.h2020Programme.code") + .collectAsList() + .get(0) + .getString(0) + .equals("H2020-EU.2.1.4.") || + execverification + .filter("id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5'") + .select("classification.h2020Programme.code") + .collectAsList() + .get(1) + .getString(0) + .equals("H2020-EU.2.1.4.")); + Assertions + .assertTrue( + execverification + .filter("id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5'") + .select("classification.h2020Programme.code") + .collectAsList() + .get(0) + .getString(0) + .equals("H2020-EU.3.2.6.") || + execverification + .filter("id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5'") + .select("classification.h2020Programme.code") + .collectAsList() + .get(1) + .getString(0) + .equals("H2020-EU.3.2.6.")); + Assertions + .assertEquals( + "Biotechnology", + execverification + .filter( + "id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5' and classification.h2020Programme.code = 'H2020-EU.2.1.4.'") + .select("classification.h2020Programme.description") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Bio-based Industries Joint Technology Initiative (BBI-JTI)", + execverification + .filter( + "id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5' and classification.h2020Programme.code = 'H2020-EU.3.2.6.'") + .select("classification.h2020Programme.description") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "BBI-2019-SO3-D4", + execverification + .filter("id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5'") + .select("h2020topiccode") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "Demonstrate bio-based pesticides and/or biostimulant agents for sustainable increase in agricultural productivity", + execverification + .filter("id = '40|corda__h2020::a657c271769fec90b60c1f2dbc25f4d5'") + .select("h2020topicdescription") + .collectAsList() + .get(0) + .getString(0)); } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java index 51a7019ca..8bcf08906 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java @@ -13,7 +13,7 @@ public class HttpConnectorTest { private static final Log log = LogFactory.getLog(HttpConnectorTest.class); private static HttpConnector connector; - private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020programmes.csv"; + private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; private static final String URL_MISCONFIGURED_SERVER = "https://www.alexandria.unisg.ch/cgi/oai2?verb=Identify"; private static final String URL_GOODSNI_SERVER = "https://air.unimi.it/oai/openaire?verb=Identify"; diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz new file mode 100644 index 000000000..d40bac8c6 Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz differ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json index b8805b2db..058ce8877 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json @@ -3,7 +3,7 @@ {"rcn":"229281","id":"896300","acronym":"STRETCH","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"Smart Textiles for RETrofitting and Monitoring of Cultural Heritage Buildings","startDate":"2020-09-01","endDate":"2022-08-31","projectUrl":"","objective":"This project aims to develop novel techniques using smart multifunctional materials for the combined seismic-plus-energy retrofitting, and Structural Health Monitoring (SHM) of the European cultural heritage buildings (CHB). The need for upgrading the existing old and CHB is becoming increasingly important for the EU countries, due to: (1) their poor structural performance during recent earthquakes (e.g. Italy, Greece) or other natural hazards (e.g. extreme weather conditions) that have resulted in significant economic losses, and loss of human lives; and (2) their low energy performance which increases significantly their energy consumption (buildings are responsible for 40% of EU energy consumption). Moreover, the SHM of the existing buildings is crucial for assessing continuously their structural integrity and thus to provide information for planning cost effective and sustainable maintenance decisions. Since replacing the old buildings with new is not financially feasible, and even it is not allowed for CHB, their lifetime extension requires considering simultaneously both structural and energy retrofitting. It is noted that the annual cost of repair and maintenance of existing European building stock is estimated to be about 50% of the total construction budget, currently standing at more than €300 billion. To achieve cost effectiveness, STRETCH explores a novel approach, which integrates technical textile reinforcement with thermal insulation systems and strain sensors to provide simultaneous structural-plus-energy retrofitting combined with SHM, tailored for masonry cultural heritage building envelopes. The effectiveness of the proposed retrofitting system will be validated experimentally and analytically. Moreover, draft guidelines and recommendations for determining future research on the use of smart composite materials for the concurrent retrofitting (structural-plus-energy) and SHM of the existing cultural heritage buildings envelopes will be proposed.","totalCost":"183473,28","ecMaxContribution":"183473,28","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-ST","coordinator":"JRC -JOINT RESEARCH CENTRE- EUROPEAN COMMISSION","coordinatorCountry":"BE","participants":"","participantCountries":"","subjects":""} {"rcn":"229265","id":"892890","acronym":"RhythmicPrediction","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"Rhythmic prediction in speech perception: are our brain waves in sync with our native language?","startDate":"2021-01-01","endDate":"2022-12-31","projectUrl":"","objective":"Speech has rhythmic properties that widely differ across languages. When we listen to foreign languages, we may perceive them to be more musical, or rather more rap-like than our own. Even if we are unaware of it, the rhythm and melody of language, i.e. prosody, reflects its linguistic structure. On the one hand, prosody emphasizes content words and new information with stress and accents. On the other hand, it is aligned to phrase edges, marking them with boundary tones. Prosody hence helps the listener to focus on important words and to chunk sentences into phrases, and phrases into words. In fact, prosody is even used predictively, for instance to time the onset of the next word, the next piece of new information, or the total remaining length of the utterance, so the listener can seamlessly start their own speaking turn. \nSo, the listener, or rather their brain, is actively predicting when important speech events will happen, using prosody. How prosodic rhythms are exploited to predict speech timing, however, is unclear. No link between prosody and neural predictive processing has yet been empirically made. One hypothesis is that rhythm, such as the alternation of stressed and unstressed syllables, helps listeners time their attention. Similar behavior is best captured by the notion of an internal oscillator which can be set straight by attentional spikes. While neuroscientific evidence for the relation of neural oscillators to speech processing is starting to emerge, no link to the use of prosody nor predictive listening exists, yet. Furthermore, it is still unknown how native language knowledge affects cortical oscillations, and how oscillations are affected by cross-linguistic differences in rhythmic structure. The current project combines the standing knowledge of prosodic typology with the recent advances in neuroscience on cortical oscillations, to investigate the role of internal oscillators on native prosody perception, and active speech prediction.","totalCost":"191149,44","ecMaxContribution":"191149,44","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-ST","coordinator":"UNIVERSITE DE GENEVE","coordinatorCountry":"CH","participants":"","participantCountries":"","subjects":""} {"rcn":"229235","id":"886828","acronym":"ASAP","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"Advanced Solutions for Asphalt Pavements","startDate":"2021-09-01","endDate":"2023-08-31","projectUrl":"","objective":"The Advanced Solutions for Asphalt Pavements (ASAP) project involves the development of a unique road paving technology which will use a bio-bitumen rejuvenator to rejuvenate aged asphalt bitumen. This technology will help to extend the lifespan of asphalt pavements (roads) and will reduce the environmental and economic impact of roads and road maintenance processes. Recycling and self-healing processes will replace fossil fuel dependent technology. Self-healing will involve rejuvenating aged asphalt bitumen using a bio-rejuvenator developed using microalgae oils (rejuvenating bio-oil). Microalgae has been selected because of its fast growth, versatility and ability to survive within hostile environments, such as wastewater. \n\nASAP will utilise microalgae, cultivated within the wastewater treatment process, as a source of the rejuvenating bio-oil. The solvent (Soxhlet) processes will be used to extract the oil from the microalgae. To ensure the efficiency of the oil extraction process, an ultrasonication process will be used to pre-treat the microalgae. The suitability of rejuvenating bio-oil as a replacement for the bitumen rejuvenator (fossil fuel based) will be ascertained via a series of standard bituminous and accelerated tests. A rejuvenator-binder diffusion numerical model will be developed, based on the Delft Lattice concrete diffusion model, to determine the conditions required for rejuvenation to occur and to ascertain the healing rate of the asphalt binder. These parameters will facilitate the selection and optimisation of the asphalt self-healing systems (specifically the amount of bio-oil rejuvenator and time required) to achieve full rejuvenation. \n\nThis novel approach will benchmark the effectiveness of this intervention against existing asphalt design and maintenance processes and assess feasibility. The ASAP project presents an opportunity to revolutionise road design and maintenance processes and reduce its environmental and financial costs.","totalCost":"187572,48","ecMaxContribution":"187572,48","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-ST","coordinator":"NEDERLANDSE ORGANISATIE VOOR TOEGEPAST NATUURWETENSCHAPPELIJK ONDERZOEK TNO","coordinatorCountry":"NL","participants":"","participantCountries":"","subjects":""} -{"rcn":null,"id":"886776","acronym":null,"status":null,"programme":"H2020-EU.2.1.4.","topics":null,"frameworkProgramme":"H2020","title":"BIO-Based pESTicides production for sustainable agriculture management plan","startDate":"2020-05-01","endDate":"2023-04-30","projectUrl":"","objective":"The BIOBESTicide project will validate and demonstrate the production of an effective and cost-efficient biopesticide. The demonstration will be based on an innovative bio-based value chain starting from the valorisation of sustainable biomasses, i.e. beet pulp and sugar molasses and will exploit the properties of the oomycete Pythium oligandrum strain I-5180 to increase natural plant defenses, to produce an highly effective and eco-friendly biopesticide solution for vine plants protection. \nBIOVITIS, the project coordinator, has developed, at laboratory level (TRL4), an effective method to biocontrol one of the major causes of worldwide vineyards destruction, the Grapevine Trunk Diseases (GTDs). The protection system is based on the oomycete Pythium oligandrum strain I-5180 that, at applied at optimal time and concentration, colonises the root of vines and stimulates the natural plant defences against GTDs, providing a protection that ranges between 40% and 60%. \nBIOBESTicide project will respond to the increasing demands for innovative solutions for crop protection agents, transferring the technology to a DEMO Plant able to produce more than 10 T of a high-quality oomycete-based biopesticide product per year (TRL7). \nThe BIOBESTicide project will validate the efficiency of the formulated product on vineyards of different geographical areas.\nTo assure the safety of products under both health and environmental points of view, a full and complete approval dossier for Pythium oligandrum strain I-5180 will be submitted in all the European countries. \nA Life Cycle Sustainability Assessment (LCSA) will be conducted to assess the environmental, economic and social impacts of the developed products.\nThe adoption of the effective and cost-efficient biopesticide will have significant impacts with a potential ROI of 30 % in just 5 years and a total EBITDA of more than € 6,400,000.","totalCost":"4402772,5","ecMaxContribution":"3069653","call":"H2020-BBI-JTI-2019","fundingScheme":"BBI-IA-DEMO","coordinator":"BIOVITIS","coordinatorCountry":"FR","participants":"MERCIER FRERES SARL;FUNDACION TECNALIA RESEARCH & INNOVATION;LAMBERTI SPA;EURION CONSULTING;CIAOTECH Srl;STOWARZYSZENIE ZACHODNIOPOMORSKI KLASTER CHEMICZNY ZIELONA CHEMIA;NORDZUCKER AG;INSTITUT NATIONAL DE RECHERCHE POUR L'AGRICULTURE, L'ALIMENTATION ET L'ENVIRONNEMENT;INSTITUT FRANCAIS DE LA VIGNE ET DU VIN","participantCountries":"FR;ES;IT;PL;DE","subjects":""} +{"rcn":null,"id":"886776","acronym":null,"status":null,"programme":"H2020-EU.2.1.4.","topics":"BBI-2019-SO3-D4","frameworkProgramme":"H2020","title":"BIO-Based pESTicides production for sustainable agriculture management plan","startDate":"2020-05-01","endDate":"2023-04-30","projectUrl":"","objective":"The BIOBESTicide project will validate and demonstrate the production of an effective and cost-efficient biopesticide. The demonstration will be based on an innovative bio-based value chain starting from the valorisation of sustainable biomasses, i.e. beet pulp and sugar molasses and will exploit the properties of the oomycete Pythium oligandrum strain I-5180 to increase natural plant defenses, to produce an highly effective and eco-friendly biopesticide solution for vine plants protection. \nBIOVITIS, the project coordinator, has developed, at laboratory level (TRL4), an effective method to biocontrol one of the major causes of worldwide vineyards destruction, the Grapevine Trunk Diseases (GTDs). The protection system is based on the oomycete Pythium oligandrum strain I-5180 that, at applied at optimal time and concentration, colonises the root of vines and stimulates the natural plant defences against GTDs, providing a protection that ranges between 40% and 60%. \nBIOBESTicide project will respond to the increasing demands for innovative solutions for crop protection agents, transferring the technology to a DEMO Plant able to produce more than 10 T of a high-quality oomycete-based biopesticide product per year (TRL7). \nThe BIOBESTicide project will validate the efficiency of the formulated product on vineyards of different geographical areas.\nTo assure the safety of products under both health and environmental points of view, a full and complete approval dossier for Pythium oligandrum strain I-5180 will be submitted in all the European countries. \nA Life Cycle Sustainability Assessment (LCSA) will be conducted to assess the environmental, economic and social impacts of the developed products.\nThe adoption of the effective and cost-efficient biopesticide will have significant impacts with a potential ROI of 30 % in just 5 years and a total EBITDA of more than € 6,400,000.","totalCost":"4402772,5","ecMaxContribution":"3069653","call":"H2020-BBI-JTI-2019","fundingScheme":"BBI-IA-DEMO","coordinator":"BIOVITIS","coordinatorCountry":"FR","participants":"MERCIER FRERES SARL;FUNDACION TECNALIA RESEARCH & INNOVATION;LAMBERTI SPA;EURION CONSULTING;CIAOTECH Srl;STOWARZYSZENIE ZACHODNIOPOMORSKI KLASTER CHEMICZNY ZIELONA CHEMIA;NORDZUCKER AG;INSTITUT NATIONAL DE RECHERCHE POUR L'AGRICULTURE, L'ALIMENTATION ET L'ENVIRONNEMENT;INSTITUT FRANCAIS DE LA VIGNE ET DU VIN","participantCountries":"FR;ES;IT;PL;DE","subjects":""} {"rcn":null,"id":"886776","acronym":null,"status":null,"programme":"H2020-EU.3.2.6.","topics":"BBI-2019-SO3-D4","frameworkProgramme":"H2020","title":"BIO-Based pESTicides production for sustainable agriculture management plan","startDate":"2020-05-01","endDate":"2023-04-30","projectUrl":"","objective":"The BIOBESTicide project will validate and demonstrate the production of an effective and cost-efficient biopesticide. The demonstration will be based on an innovative bio-based value chain starting from the valorisation of sustainable biomasses, i.e. beet pulp and sugar molasses and will exploit the properties of the oomycete Pythium oligandrum strain I-5180 to increase natural plant defenses, to produce an highly effective and eco-friendly biopesticide solution for vine plants protection. \nBIOVITIS, the project coordinator, has developed, at laboratory level (TRL4), an effective method to biocontrol one of the major causes of worldwide vineyards destruction, the Grapevine Trunk Diseases (GTDs). The protection system is based on the oomycete Pythium oligandrum strain I-5180 that, at applied at optimal time and concentration, colonises the root of vines and stimulates the natural plant defences against GTDs, providing a protection that ranges between 40% and 60%. \nBIOBESTicide project will respond to the increasing demands for innovative solutions for crop protection agents, transferring the technology to a DEMO Plant able to produce more than 10 T of a high-quality oomycete-based biopesticide product per year (TRL7). \nThe BIOBESTicide project will validate the efficiency of the formulated product on vineyards of different geographical areas.\nTo assure the safety of products under both health and environmental points of view, a full and complete approval dossier for Pythium oligandrum strain I-5180 will be submitted in all the European countries. \nA Life Cycle Sustainability Assessment (LCSA) will be conducted to assess the environmental, economic and social impacts of the developed products.\nThe adoption of the effective and cost-efficient biopesticide will have significant impacts with a potential ROI of 30 % in just 5 years and a total EBITDA of more than € 6,400,000.","totalCost":"4402772,5","ecMaxContribution":"3069653","call":"H2020-BBI-JTI-2019","fundingScheme":"BBI-IA-DEMO","coordinator":"BIOVITIS","coordinatorCountry":"FR","participants":"MERCIER FRERES SARL;FUNDACION TECNALIA RESEARCH & INNOVATION;LAMBERTI SPA;EURION CONSULTING;CIAOTECH Srl;STOWARZYSZENIE ZACHODNIOPOMORSKI KLASTER CHEMICZNY ZIELONA CHEMIA;NORDZUCKER AG;INSTITUT NATIONAL DE RECHERCHE POUR L'AGRICULTURE, L'ALIMENTATION ET L'ENVIRONNEMENT;INSTITUT FRANCAIS DE LA VIGNE ET DU VIN","participantCountries":"FR;ES;IT;PL;DE","subjects":""} {"rcn":"229276","id":"895426","acronym":"DisMoBoH","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"Dissecting the molecular building principles of locally formed transcriptional hubs","startDate":"2021-09-01","endDate":"2023-08-31","projectUrl":"","objective":"Numerous DNA variants have already been identified that modulate inter-individual molecular traits – most prominently gene expression. However, since finding mechanistic interpretations relating genotype to phenotype has proven challenging, the focus has shifted to higher-order regulatory features, i.e. chromatin accessibility, transcription factor (TF) binding and 3D chromatin interactions. This revealed at least two enhancer types: “lead” enhancers in which the presence of genetic variants modulates the activity of entire chromatin domains, and “dependent” ones in which variants induce subtle changes, affecting DNA accessibility, but not transcription. Although cell type-specific TFs are likely important, it remains unclear which sequence features are required to establish such enhancer hierarchies, and under which circumstances genetic variation results in altered enhancer-promoter contacts and differential gene expression. Here, we propose to investigate the molecular mechanisms that link DNA variation to TF binding, chromatin topology, and gene expression response. We will leverage data on enhancer hierarchy and sequence-specific TF binding to identify the sequence signatures that define “lead” enhancers. The results will guide the design of a synthetic locus that serves as an in vivo platform to systematically vary the building blocks of local transcriptional units: i) DNA sequence – including variations in TF binding site affinity and syntax, ii) molecular interactions between TFs, and iii) chromatin conformation. To validate our findings, we will perform optical reconstruction of chromatin architecture for a select number of DNA variants. By simultaneously perturbing co-dependent features, this proposal will provide novel mechanistic insights into the formation of local transcriptional hubs.","totalCost":"191149,44","ecMaxContribution":"191149,44","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-RI","coordinator":"ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE","coordinatorCountry":"CH","participants":"","participantCountries":"","subjects":""} {"rcn":"229288","id":"898218","acronym":"devUTRs","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"Uncovering the roles of 5′UTRs in translational control during early zebrafish development","startDate":"2021-09-01","endDate":"2023-08-31","projectUrl":"","objective":"Following fertilisation, metazoan embryos are transcriptionally silent, and embryogenesis is controlled by maternally deposited factors. Developmental progression requires the synthesis of new mRNAs and proteins in a coordinated fashion. Many posttranscriptional mechanisms regulate the fate of maternal mRNAs, but it is less understood how translational control shapes early embryogenesis. In eukaryotes, translation starts at the mRNA 5′ end, consisting of the 5′ cap and 5′ untranslated region (UTR). Protein synthesis is primarily regulated at the translation initiation step by elements within the 5′UTR. However, the role of 5′UTRs in regulating the dynamics of mRNA translation during vertebrate embryogenesis remains unexplored. For example, all vertebrate ribosomal protein (RP) mRNAs harbor a conserved terminal oligopyrimidine tract (TOP) in their 5′UTR. RP levels must be tightly controlled to ensure proper organismal development, but if and how the TOP motif mediates RP mRNA translational regulation during embryogenesis is unclear. Overall, we lack a systematic understanding of the regulatory information contained in 5′UTRs. In this work, I aim to uncover the 5′UTR in vivo rules for mRNA translational regulation during zebrafish embryogenesis. I propose to apply imaging and biochemical approaches to characterise the role of the TOP motif in RP mRNA translational regulation during embryogenesis and identify the trans-acting factor(s) that bind(s) to it (Aim 1). To systematically assess the contribution of 5′UTRs to mRNA translational regulation during zebrafish embryogenesis, I will couple a massively parallel reporter assay of 5′UTRs to polysome profiling (Aim 2). By integrating the translational behaviour of 5′UTR reporters throughout embryogenesis with sequence-based regression models, I anticipate to uncover novel cis-regulatory elements in 5′UTRs with developmental roles.","totalCost":"191149,44","ecMaxContribution":"191149,44","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-ST","coordinator":"UNIVERSITAT BASEL","coordinatorCountry":"CH","participants":"","participantCountries":"","subjects":""} @@ -13,4 +13,5 @@ {"rcn":"229239","id":"887259","acronym":"ALEHOOP","status":"SIGNED","programme":"H2020-EU.2.1.4.","topics":"BBI-2019-SO3-D3","frameworkProgramme":"H2020","title":"Biorefineries for the valorisation of macroalgal residual biomass and legume processing by-products to obtain new protein value chains for high-value food and feed applications","startDate":"2020-06-01","endDate":"2024-05-31","projectUrl":"","objective":"ALEHOOP provides the demonstration at pilot scale of both sustainable macroalgae and legume-based biorefineries for the recovery of low-cost dietary proteins from alga-based and plant residual biomass and their validation to meet market requirements of consumers and industry in the food and feed sectors. In these sectors, consumers are demanding affordable functional natural proteins from alternative sources and industry is demanding low-cost bio-based protein formulations with better performance and higher sustainability. \nCurrent protein demand for the 7.3 billion inhabitants of the world is approximately 202 Mt. Due to the rise in meat consumption more proteins are therefore required for animal feeding. To satisfy the current protein demand, Europe imports over 30 Mt of soy from the Americas each year mainly for animal feeding, entailing 95% dependency of EU on imported soy. Current sources of proteins are becoming unsustainable from an economic and environmental perspective for Europe resulting in concerns for sustainability and food security and leading to search for new alternative proteins. \nALEHOOP addresses the obtaining of proteins from green macroalgal blooms, brown seaweed by-products from algae processors and legume processing by-products (peas, lupines, beans and lentils) as alternative protein sources for animal feeding (case of green seaweed) and food applications (case of brown seaweed and legume by-products), since they are low cost and under-exploited biomass that do not compete with traditional food crops for space and resources. This will reduce EU´s dependency on protein imports and contribute to our raw material security. The new proteins will be validated in foods for elderly, sporty and overweight people, vegetarians and healthy consumers as well as for animal feed creating cross-sectorial interconnection between these value chains and supporting the projected business plan.","totalCost":"6718370","ecMaxContribution":"5140274,41","call":"H2020-BBI-JTI-2019","fundingScheme":"BBI-IA-DEMO","coordinator":"CONTACTICA S.L.","coordinatorCountry":"ES","participants":"CENTIV GMBH;ALGINOR ASA;FUNDACION TECNALIA RESEARCH & INNOVATION;INDUKERN,S.A.;ASOCIACION NACIONAL DE FABRICANTES DE CONSERVAS DE PESCADOS Y MARISCOS-CENTRO TECNICO NACIONAL DE CONSERVACION DE PRODUCTOS DE LA PESCA;BIOZOON GMBH;EIGEN VERMOGEN VAN HET INSTITUUT VOOR LANDBOUW- EN VISSERIJONDERZOEK;BIOSURYA SL;VYZKUMNY USTAV VETERINARNIHO LEKARSTVI;NUTRITION SCIENCES;TECHNOLOGICAL UNIVERSITY DUBLIN;GARLAN, S.COOP.;ISANATUR SPAIN SL;UNIVERSIDAD DE VIGO;UNIVERSIDAD DE CADIZ","participantCountries":"DE;NO;ES;BE;CZ;IE","subjects":""} {"rcn":"229239","id":"887259","acronym":"ALEHOOP","status":"SIGNED","programme":"H2020-EU.3.2.6.","topics":"BBI-2019-SO3-D3","frameworkProgramme":"H2020","title":"Biorefineries for the valorisation of macroalgal residual biomass and legume processing by-products to obtain new protein value chains for high-value food and feed applications","startDate":"2020-06-01","endDate":"2024-05-31","projectUrl":"","objective":"ALEHOOP provides the demonstration at pilot scale of both sustainable macroalgae and legume-based biorefineries for the recovery of low-cost dietary proteins from alga-based and plant residual biomass and their validation to meet market requirements of consumers and industry in the food and feed sectors. In these sectors, consumers are demanding affordable functional natural proteins from alternative sources and industry is demanding low-cost bio-based protein formulations with better performance and higher sustainability. \nCurrent protein demand for the 7.3 billion inhabitants of the world is approximately 202 Mt. Due to the rise in meat consumption more proteins are therefore required for animal feeding. To satisfy the current protein demand, Europe imports over 30 Mt of soy from the Americas each year mainly for animal feeding, entailing 95% dependency of EU on imported soy. Current sources of proteins are becoming unsustainable from an economic and environmental perspective for Europe resulting in concerns for sustainability and food security and leading to search for new alternative proteins. \nALEHOOP addresses the obtaining of proteins from green macroalgal blooms, brown seaweed by-products from algae processors and legume processing by-products (peas, lupines, beans and lentils) as alternative protein sources for animal feeding (case of green seaweed) and food applications (case of brown seaweed and legume by-products), since they are low cost and under-exploited biomass that do not compete with traditional food crops for space and resources. This will reduce EU´s dependency on protein imports and contribute to our raw material security. The new proteins will be validated in foods for elderly, sporty and overweight people, vegetarians and healthy consumers as well as for animal feed creating cross-sectorial interconnection between these value chains and supporting the projected business plan.","totalCost":"6718370","ecMaxContribution":"5140274,41","call":"H2020-BBI-JTI-2019","fundingScheme":"BBI-IA-DEMO","coordinator":"CONTACTICA S.L.","coordinatorCountry":"ES","participants":"CENTIV GMBH;ALGINOR ASA;FUNDACION TECNALIA RESEARCH & INNOVATION;INDUKERN,S.A.;ASOCIACION NACIONAL DE FABRICANTES DE CONSERVAS DE PESCADOS Y MARISCOS-CENTRO TECNICO NACIONAL DE CONSERVACION DE PRODUCTOS DE LA PESCA;BIOZOON GMBH;EIGEN VERMOGEN VAN HET INSTITUUT VOOR LANDBOUW- EN VISSERIJONDERZOEK;BIOSURYA SL;VYZKUMNY USTAV VETERINARNIHO LEKARSTVI;NUTRITION SCIENCES;TECHNOLOGICAL UNIVERSITY DUBLIN;GARLAN, S.COOP.;ISANATUR SPAIN SL;UNIVERSIDAD DE VIGO;UNIVERSIDAD DE CADIZ","participantCountries":"DE;NO;ES;BE;CZ;IE","subjects":""} {"rcn":"229258","id":"892834","acronym":"DENVPOC","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"qPCR Microfluidics point-of-care platform for dengue diagnosis","startDate":"2020-05-18","endDate":"2022-05-17","projectUrl":"","objective":"As a result of Global climate change and fast urbanization, global outbreaks of Dengue (DENV)/ Zika(ZIKV)/Chikungunya(CHIKV) virus have the potential to occur. The most common pathway of these infections in humans is through the female Aedes mosquito vector. DENV is an exanthematous febrile disease with varied clinical manifestations and progressions . Due to similarities in symptoms between DENV and ZIKV and CHIKV, it is difficult to make a differential diagnosis, impeding appropriate, timely medical intervention. Furthermore, cross-reactivity with ZIKV, which was recently related to microcephaly, is a serious issue. In 2016, in Brazil alone, there were 4180 microcephaly cases reported instead of 163 cases, more in line with yearly expected projections , , Thus, the sooner an accurate diagnostic which differentiates DENV from the other manifestations is critical; most especially at the early stages of the infection, to have a reliable diagnosis in pregnant women. In 2016, the OMS emergency committee declared that the outbreaks and the potentially resultant neurological disorders in Brazil were an important international state of emergency in public health, as a result of the associated secondary effects; these diseases became a Global concern. This project allows developing a highly and fast Multiplex qPCR POC platform by using FASTGENE technology with a minimal amount of patient serotype. It would reduce the time of analysis (30 to 90’ for a standard) and costs. Additionally, the sample preprocessing and thermalization will shorten real-time PCR amplification time and will be integrated within the microfluidic systems. This platform can result in a commercialized product whereupon a main market target would be pregnant women and people living or traveling through/from outbreak risk areas.","totalCost":"196707,84","ecMaxContribution":"196707,84","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-SE","coordinator":"BFORCURE","coordinatorCountry":"FR","participants":"","participantCountries":"","subjects":""} -{"rcn":"229280","id":"895716","acronym":"DoMiCoP","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"The Diffusion of Migration Control Practice. Actors, Processes and Effects.","startDate":"2021-03-01","endDate":"2023-02-28","projectUrl":"","objective":"DoMiCoP develops new understandings and perspectives to study migration control in practice in the European Union by asking one main question: how and why do communities of practice develop and diffuse the knowledge required to put migration control into action? Unlike the nexus between expert knowledge, epistemic communities and policy formulation, the nexus between everyday knowledge, communities of practice and policy implementation has not yet received systematic scholarly attention. My project bridges that gap by focusing on intermediate arenas in which communities of practice take shape most notably the meetings and trainings that gather state and non-state actors involved in putting asylum, detention and removal into practice. By building on field-based methodologies (interviews and participant observations), DoMiCoP sheds ethnographic light on the role that ‘learning from abroad’ plays in the implementation of migration control in the EU. My project’s aim is threefold: 1) Identifying arenas at intermediate levels in which communities of practice take shape; 2) Analysing the communities of practice by focusing on the configurations of actors and organizations involved, the motivations underlying their involvement, the process of knowledge development in interaction, the conflicts and negotiations; 3) Revealing the role of non-state organizations (private for profit and not-for-profit). From a theoretical point of view, this project goes beyond the classical view of the implementation as a test to assess the effectiveness of policy transfers towards an analysis of policy transfer at that level of policy-making. From an empirical point of view, the project expands knowledge about less-studied venues of policy-making and provides original thick descriptions. From a methodological point of view, the project engages with qualitative methods for the study of policy diffusion and aims at responding to their main challenges through participant observation.","totalCost":"163673,28","ecMaxContribution":"163673,28","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-ST","coordinator":"EUROPEAN UNIVERSITY INSTITUTE","coordinatorCountry":"IT","participants":"","participantCountries":"","subjects":""} \ No newline at end of file +{"rcn":"229280","id":"895716","acronym":"DoMiCoP","status":"SIGNED","programme":"H2020-EU.1.3.2.","topics":"MSCA-IF-2019","frameworkProgramme":"H2020","title":"The Diffusion of Migration Control Practice. Actors, Processes and Effects.","startDate":"2021-03-01","endDate":"2023-02-28","projectUrl":"","objective":"DoMiCoP develops new understandings and perspectives to study migration control in practice in the European Union by asking one main question: how and why do communities of practice develop and diffuse the knowledge required to put migration control into action? Unlike the nexus between expert knowledge, epistemic communities and policy formulation, the nexus between everyday knowledge, communities of practice and policy implementation has not yet received systematic scholarly attention. My project bridges that gap by focusing on intermediate arenas in which communities of practice take shape most notably the meetings and trainings that gather state and non-state actors involved in putting asylum, detention and removal into practice. By building on field-based methodologies (interviews and participant observations), DoMiCoP sheds ethnographic light on the role that ‘learning from abroad’ plays in the implementation of migration control in the EU. My project’s aim is threefold: 1) Identifying arenas at intermediate levels in which communities of practice take shape; 2) Analysing the communities of practice by focusing on the configurations of actors and organizations involved, the motivations underlying their involvement, the process of knowledge development in interaction, the conflicts and negotiations; 3) Revealing the role of non-state organizations (private for profit and not-for-profit). From a theoretical point of view, this project goes beyond the classical view of the implementation as a test to assess the effectiveness of policy transfers towards an analysis of policy transfer at that level of policy-making. From an empirical point of view, the project expands knowledge about less-studied venues of policy-making and provides original thick descriptions. From a methodological point of view, the project engages with qualitative methods for the study of policy diffusion and aims at responding to their main challenges through participant observation.","totalCost":"163673,28","ecMaxContribution":"163673,28","call":"H2020-MSCA-IF-2019","fundingScheme":"MSCA-IF-EF-ST","coordinator":"EUROPEAN UNIVERSITY INSTITUTE","coordinatorCountry":"IT","participants":"","participantCountries":"","subjects":""} +{"rcn":"230066","id":"883730","acronym":"SOLSPACE","status":"SIGNED","programme":"H2020-EU.1.1.","topics":"ERC-2019-ADG","frameworkProgramme":"H2020","title":"Enhancing Global Clean Energy Services Using Orbiting Solar Reflectors", "startDate":"2021-03-01","endDate":"2025-11-30","projectUrl":"","objective":"fake", "totalCost":"2496392","ecMaxContribution":"2496392","call":"ERC-2019-ADG","fundingScheme":"ERC-ADG","coordinator":"UNIVERSITY OF GLASGOW","coordinatorCountry":"UK","participants":"","participantCountries":"","subjects":""} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/topic.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/topic.json.gz new file mode 100644 index 000000000..623c92427 Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/topic.json.gz differ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz index 85440ceed..60c3bf05a 100644 Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz differ diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 8028d5a94..f7e2e9ae0 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.dedup; -import java.io.Serializable; import java.util.Collection; import java.util.Iterator; import java.util.List; @@ -18,6 +17,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; +import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index 3fb9d1751..30cfebe79 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -14,6 +14,7 @@ import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; @@ -65,7 +66,7 @@ public class EntityMergerTest implements Serializable { assertEquals(dedupId, pub_merged.getId()); assertEquals(pub_merged.getJournal(), pub_top.getJournal()); - assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright()); + assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN"); assertEquals(pub_merged.getResulttype(), pub_top.getResulttype()); assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage()); assertEquals(pub_merged.getPublisher(), pub_top.getPublisher()); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index fb5ebc099..2c1607165 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -194,10 +194,10 @@ public class SparkDedupTest implements Serializable { .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") .count(); - assertEquals(3432, orgs_simrel); - assertEquals(7152, pubs_simrel); + assertEquals(3082, orgs_simrel); + assertEquals(7036, pubs_simrel); assertEquals(344, sw_simrel); - assertEquals(458, ds_simrel); + assertEquals(442, ds_simrel); assertEquals(6750, orp_simrel); } @@ -343,8 +343,8 @@ public class SparkDedupTest implements Serializable { .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") .count(); - assertEquals(1276, orgs_mergerel); - assertEquals(1442, pubs_mergerel); + assertEquals(1272, orgs_mergerel); + assertEquals(1438, pubs_mergerel); assertEquals(288, sw_mergerel); assertEquals(472, ds_mergerel); assertEquals(718, orp_mergerel); @@ -390,10 +390,10 @@ public class SparkDedupTest implements Serializable { testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") .count(); - assertEquals(82, orgs_deduprecord); - assertEquals(66, pubs_deduprecord); + assertEquals(85, orgs_deduprecord); + assertEquals(65, pubs_deduprecord); assertEquals(51, sw_deduprecord); - assertEquals(96, ds_deduprecord); + assertEquals(97, ds_deduprecord); assertEquals(89, orp_deduprecord); } @@ -473,12 +473,12 @@ public class SparkDedupTest implements Serializable { .distinct() .count(); - assertEquals(897, publications); - assertEquals(835, organizations); + assertEquals(896, publications); + assertEquals(838, organizations); assertEquals(100, projects); assertEquals(100, datasource); assertEquals(200, softwares); - assertEquals(388, dataset); + assertEquals(389, dataset); assertEquals(517, otherresearchproduct); long deletedOrgs = jsc @@ -533,7 +533,7 @@ public class SparkDedupTest implements Serializable { long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); - assertEquals(4866, relations); + assertEquals(4858, relations); // check deletedbyinference final Dataset mergeRels = spark diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java index 7e76c284b..31de8d951 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java @@ -168,10 +168,10 @@ public class SparkStatsTest implements Serializable { .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") .count(); - assertEquals(121, orgs_blocks); - assertEquals(110, pubs_blocks); - assertEquals(21, sw_blocks); - assertEquals(67, ds_blocks); - assertEquals(55, orp_blocks); + assertEquals(549, orgs_blocks); + assertEquals(299, pubs_blocks); + assertEquals(122, sw_blocks); + assertEquals(186, ds_blocks); + assertEquals(170, orp_blocks); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json index ae688e746..2e568e050 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json @@ -1,3 +1,3 @@ -{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.95"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}], "id": "50|a89337edbe55::4930db9e954866d70916cbfba9f81f97", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": [], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-9999"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2019-11-05T14:49:22.351Z", "fulltext": [], "dateoftransformation": "2019-11-05T16:10:58.988Z", "description": [], "format": [], "journal": {"issnPrinted": "1459-6067", "conferencedate": "", "conferenceplace": "", "name": "Agricultural and Food Science", "edition": "", "iss": "3", "sp": "", "vol": "27", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "1795-1895", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": [], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2018-09-30"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.95"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}], "id": "50|a89337edbe55::4930db9e954866d70916cbfba9f81f97", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": [], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-9999"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2019-11-05T14:49:22.351Z", "fulltext": [], "dateoftransformation": "2019-11-05T16:10:58.988Z", "description": [], "format": [], "journal": {"issnPrinted": "1459-6067", "conferencedate": "", "conferenceplace": "", "name": "Agricultural and Food Science", "edition": "", "iss": "3", "sp": "", "vol": "27", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "1795-1895", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "12MONTHS", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": [], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2018-09-30"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} {"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1016/j.nicl.2015.11.006"}], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}], "id": "50|base_oa_____::0968af610a356656706657e4f234b340", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "NeuroImage: Clinical", "key": "10|doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "http://creativecommons.org/licenses/by-nc-nd/4.0/"}, "url": ["http://dx.doi.org/10.1016/j.nicl.2015.11.006"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}, {"surname": "Klein", "name": "Christine", "pid": [], "rank": 10, "affiliation": [], "fullname": "Klein, Christine"}, {"surname": "Deuschl", "name": "Gu\\u0308nther", "pid": [], "rank": 11, "affiliation": [], "fullname": "Deuschl, G\\u00fcnther"}, {"surname": "Eimeren", "name": "Thilo", "pid": [], "rank": 12, "affiliation": [], "fullname": "van Eimeren, Thilo"}, {"surname": "Witt", "name": "Karsten", "pid": [], "rank": 13, "affiliation": [], "fullname": "Witt, Karsten"}], "source": [], "dateofcollection": "2017-07-27T19:04:09.131Z", "fulltext": [], "dateoftransformation": "2019-01-23T10:15:19.582Z", "description": [], "format": [], "journal": {"issnPrinted": "2213-1582", "conferencedate": "", "conferenceplace": "", "name": "NeuroImage: Clinical", "edition": "", "iss": "", "sp": "63", "vol": "10", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "", "ep": "70", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Elsevier BV"}, "language": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "IT", "classname": "Italy", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["10.1016/j.nicl.2015.11.006"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} {"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}], "id": "50|CrisUnsNoviS::9f9d014eea45dab432cab636c4c9cf39", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": ["https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2019-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "accessright": {"classid": "UNKNOWN", "classname": "UNKNOWN", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}, {"qualifier": {"classid": "pubmed", "classname": "pubmed"}, "value": "pubmed.it"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [{"qualifier": {"classid": "id", "classname": "id"}, "value": "12345678"}], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-1023"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2020-03-10T15:05:38.685Z", "fulltext": [], "dateoftransformation": "2020-03-11T20:11:13.15Z", "description": [], "format": [], "journal": {"issnPrinted": "", "conferencedate": "", "conferenceplace": "", "name": "", "edition": "", "iss": "", "sp": "", "vol": "", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "en", "classname": "en", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "FI", "classname": "Finland", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["(BISIS)113444", "https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "test title", "classname": "test title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Antichains of copies of ultrahomogeneous structures"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 772af010f..a29809fc0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -1,13 +1,15 @@ package eu.dnetlib.doiboost import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset, Organization} +import eu.dnetlib.dhp.oa.merge.AuthorMerger +import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset} import eu.dnetlib.doiboost.mag.ConversionUtil import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + import scala.collection.JavaConverters._ object SparkGenerateDoiBoost { @@ -49,6 +51,7 @@ object SparkGenerateDoiBoost { val otherPub = item._2._2 if (otherPub != null) { crossrefPub.mergeFrom(otherPub) + crossrefPub.setAuthor(AuthorMerger.mergeAuthor(crossrefPub.getAuthor, otherPub.getAuthor)) } } crossrefPub diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index b38e103bc..096217a55 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -14,6 +14,8 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex +case class CrossrefDT(doi: String, json:String) {} + case class mappingAffiliation(name: String) {} case class mappingAuthor(given: Option[String], family: String, ORCID: Option[String], affiliation: Option[mappingAffiliation]) {} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala new file mode 100644 index 000000000..996ba5585 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -0,0 +1,93 @@ +package eu.dnetlib.doiboost.crossref + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse +import org.slf4j.{Logger, LoggerFactory} + +object CrossrefDataset { + + + def extractTimestamp(input:String): Long = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input) + + (json\"indexed"\"timestamp").extractOrElse[Long](0) + + } + + + def main(args: Array[String]): Unit = { + + + val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkMapDumpIntoOAF.getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + import spark.implicits._ + + + val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable { + + override def zero: CrossrefDT = null + + override def reduce(b: CrossrefDT, a: CrossrefDT): CrossrefDT = { + if (b == null) + return a + if (a == null) + return b + + val tb = extractTimestamp(b.json) + val ta = extractTimestamp(a.json) + if(ta >tb) { + return a + } + b + } + + override def merge(a: CrossrefDT, b: CrossrefDT): CrossrefDT = { + if (b == null) + return a + if (a == null) + return b + + val tb = extractTimestamp(b.json) + val ta = extractTimestamp(a.json) + if(ta >tb) { + return a + } + b + } + + override def bufferEncoder: Encoder[CrossrefDT] = implicitly[Encoder[CrossrefDT]] + + override def outputEncoder: Encoder[CrossrefDT] = implicitly[Encoder[CrossrefDT]] + + override def finish(reduction: CrossrefDT): CrossrefDT = reduction + } + + val sourcePath:String = parser.get("sourcePath") + val targetPath:String = parser.get("targetPath") + + val ds:Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT] + + ds.groupByKey(_.doi) + .agg(crossrefAggregator.toColumn) + .map(s=>s._2) + .write.mode(SaveMode.Overwrite).save(targetPath) + + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml index db4ac96f9..be4a45afe 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml @@ -46,11 +46,11 @@ ${jobTracker} ${nameNode} eu.dnetlib.doiboost.crossref.CrossrefImporter - -t${workingPath}/input/crossref/index_dump + -t${workingPath}/input/crossref/index_dump_1 -n${nameNode} -ts${timestamp} - + @@ -68,7 +68,7 @@ --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} - --sourcePath${workingPath}/input/crossref/index_dump,${workingPath}/crossref/index_dump + --sourcePath${workingPath}/input/crossref/index_dump,${workingPath}/input/crossref/index_dump_1,${workingPath}/crossref/index_dump --targetPath${workingPath}/input/crossref --masteryarn-cluster @@ -76,5 +76,28 @@ + + + + + yarn-cluster + cluster + ExtractCrossrefToOAF + eu.dnetlib.doiboost.crossref.CrossrefDataset + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + ${sparkExtraOPT} + + --sourcePath/data/doiboost/crossref/cr_dataset + --targetPath/data/doiboost/crossref/crossrefDataset + --masteryarn-cluster + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json new file mode 100644 index 000000000..312bd0751 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json @@ -0,0 +1,6 @@ +[ + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true}, + {"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true} + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml index bf91958cf..e35f88abd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml @@ -89,7 +89,7 @@ --dbPublicationPath${workingDirPath}/doiBoostPublicationFiltered --dbDatasetPath${workingDirPath}/crossrefDataset - --crossRefRelation/data/doiboost/input/crossref/relations + --crossRefRelation${workingDirPath}/crossrefRelation --dbaffiliationRelationPath${workingDirPath}/doiBoostPublicationAffiliation -do${workingDirPath}/doiBoostOrganization --targetPath${workingDirPath}/actionDataSet diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala index c393f0ae9..243719549 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala @@ -1,54 +1,54 @@ package eu.dnetlib.dhp.doiboost -import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, StructuredProperty, Dataset => OafDataset} -import org.apache.spark.sql.functions.{col, sum} -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} -import scala.:: +import eu.dnetlib.dhp.schema.oaf.Publication +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods._ + import scala.collection.JavaConverters._ class QueryTest { + def extract_payload(input:String) :String = { - def extractLicense(p:Publication):Tuple2[String,String] = { - - val tmp = p.getInstance().asScala.map(i => i.getLicense.getValue).distinct.mkString(",") - (p.getId,tmp) - } + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input) - - def hasDOI(publication: Publication, doi:String):Boolean = { + compact(render((json \ "payload"))) - val s = publication.getOriginalId.asScala.filter(i => i.equalsIgnoreCase(doi)) - - s.nonEmpty } - def hasNullHostedBy(publication: Publication):Boolean = { - publication.getInstance().asScala.exists(i => i.getHostedby == null || i.getHostedby.getValue == null) + def hasInstanceWithUrl(p:Publication):Boolean = { + val c = p.getInstance.asScala.map(i => i.getUrl!= null && !i.getUrl.isEmpty).size + !(!p.getInstance.isEmpty && c == p.getInstance().size) } + def hasNullAccessRights(p:Publication):Boolean = { + val c = p.getInstance.asScala.map(i => i.getAccessright!= null && i.getAccessright.getClassname.nonEmpty).size + !p.getInstance.isEmpty && c == p.getInstance().size() + } - def myQuery(spark:SparkSession): Unit = { + + def myQuery(spark:SparkSession, sc:SparkContext): Unit = { implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] - implicit val mapEncoderDat: Encoder[OafDataset] = Encoders.kryo[OafDataset] - implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] - val doiboostPubs:Dataset[Publication] = spark.read.load("/data/doiboost/process/doiBoostPublicationFiltered").as[Publication] + val mapper = new ObjectMapper() + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) - val relFunder: Dataset[Relation] = spark.read.format("org.apache.spark.sql.parquet").load("/data/doiboost/process/crossrefRelation").as[Relation] - doiboostPubs.filter(p => p.getDateofacceptance != null && p.getDateofacceptance.getValue!= null && p.getDateofacceptance.getValue.length > 0 ) - - doiboostPubs.filter(p=>hasDOI(p, "10.1016/j.is.2020.101522")).collect()(0).getDescription.get(0).getValue + val ds:Dataset[Publication] = spark.read.load("/tmp/p").as[Publication] - doiboostPubs.filter(p=> hasNullHostedBy(p)).count() + ds.filter(p =>p.getBestaccessright!= null && p.getBestaccessright.getClassname.nonEmpty).count() + - doiboostPubs.map(p=> (p.getId, p.getBestaccessright.getClassname))(Encoders.tuple(Encoders.STRING,Encoders.STRING)) } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index f62ac2b67..a3bb2a4f4 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -19,8 +19,6 @@ class CrossrefMappingTest { - - @Test def testFunderRelationshipsMapping(): Unit = { val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index a0a334e3c..3e1d84c01 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -83,7 +83,6 @@ dhp-schemas ${project.version} - com.jayway.jsonpath json-path diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index f615d69f2..84f88003b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -6,9 +6,10 @@ import java.util.Objects; import java.util.function.Function; import java.util.stream.Collectors; -import com.clearspring.analytics.util.Lists; import org.apache.commons.lang3.StringUtils; +import com.clearspring.analytics.util.Lists; + import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -144,22 +145,29 @@ public class CleaningFunctions { author.setRank(i++); } } - for(Author a : r.getAuthor()) { + for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); } else { - a.setPid( - a.getPid().stream() - .filter(p -> Objects.nonNull(p.getQualifier())) - .filter(p -> StringUtils.isNotBlank(p.getValue())) - .map(p -> { - p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); - return p; - }) - .collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new)) - .values() - .stream() - .collect(Collectors.toList())); + a + .setPid( + a + .getPid() + .stream() + .filter(p -> Objects.nonNull(p.getQualifier())) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .map(p -> { + p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); + return p; + }) + .collect( + Collectors + .toMap( + StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, + LinkedHashMap::new)) + .values() + .stream() + .collect(Collectors.toList())); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java index 24b3ad751..5e329c978 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java @@ -379,13 +379,17 @@ public class DumpGraphEntities implements Serializable { } project - .setProgramme( + .setH2020Classifications( Optional - .ofNullable(p.getProgramme()) + .ofNullable(p.getH2020classification()) .map( - programme -> programme + classification -> classification .stream() - .map(pg -> Programme.newInstance(pg.getCode(), pg.getDescription())) + .map( + c -> H2020Classification + .newInstance( + c.getH2020Programme().getCode(), c.getH2020Programme().getDescription(), + c.getLevel1(), c.getLevel2(), c.getLevel3(), c.getClassification())) .collect(Collectors.toList())) .orElse(new ArrayList<>())); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/DatasourceCompatibilityComparator.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/DatasourceCompatibilityComparator.java new file mode 100644 index 000000000..59bdb3914 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/DatasourceCompatibilityComparator.java @@ -0,0 +1,97 @@ + +package eu.dnetlib.dhp.oa.graph.merge; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +public class DatasourceCompatibilityComparator implements Comparator { + @Override + public int compare(Qualifier left, Qualifier right) { + + String lClass = left.getClassid(); + String rClass = right.getClassid(); + + if (lClass.equals(rClass)) + return 0; + + if (lClass.equals("openaire-cris_1.1")) + return -1; + if (rClass.equals("openaire-cris_1.1")) + return 1; + + if (lClass.equals("openaire4.0")) + return -1; + if (rClass.equals("openaire4.0")) + return 1; + + if (lClass.equals("driver-openaire2.0")) + return -1; + if (rClass.equals("driver-openaire2.0")) + return 1; + + if (lClass.equals("driver")) + return -1; + if (rClass.equals("driver")) + return 1; + + if (lClass.equals("openaire2.0")) + return -1; + if (rClass.equals("openaire2.0")) + return 1; + + if (lClass.equals("openaire3.0")) + return -1; + if (rClass.equals("openaire3.0")) + return 1; + + if (lClass.equals("openaire2.0_data")) + return -1; + if (rClass.equals("openaire2.0_data")) + return 1; + + if (lClass.equals("native")) + return -1; + if (rClass.equals("native")) + return 1; + + if (lClass.equals("hostedBy")) + return -1; + if (rClass.equals("hostedBy")) + return 1; + + if (lClass.equals("notCompatible")) + return -1; + if (rClass.equals("notCompatible")) + return 1; + + if (lClass.equals("UNKNOWN")) + return -1; + if (rClass.equals("UNKNOWN")) + return 1; + + // Else (but unlikely), lexicographical ordering will do. + return lClass.compareTo(rClass); + } + + /* + * CASE WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY + * ['openaire-cris_1.1']) THEN 'openaire-cris_1.1@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT + * COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire4.0']) THEN + * 'openaire4.0@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, + * a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0']) THEN + * 'driver-openaire2.0@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE + * (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver']) THEN + * 'driver@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, + * a.compatibility) :: TEXT) @> ARRAY ['openaire2.0']) THEN 'openaire2.0@@@dnet:datasourceCompatibilityLevel' WHEN + * (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0']) THEN + * 'openaire3.0@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, + * a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data']) THEN + * 'openaire2.0_data@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE + * (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native']) THEN + * 'native@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, + * a.compatibility) :: TEXT) @> ARRAY ['hostedBy']) THEN 'hostedBy@@@dnet:datasourceCompatibilityLevel' WHEN + * (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible']) + * THEN 'notCompatible@@@dnet:datasourceCompatibilityLevel' ELSE 'UNKNOWN@@@dnet:datasourceCompatibilityLevel' END + */ +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java index b723de955..037683604 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java @@ -3,8 +3,9 @@ package eu.dnetlib.dhp.oa.graph.merge; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.util.Objects; -import java.util.Optional; +import java.util.*; + +import javax.xml.crypto.Data; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -14,6 +15,7 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,6 +41,14 @@ public class MergeGraphSparkJob { private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD + private static final Datasource DATASOURCE = new Datasource(); + + static { + Qualifier compatibility = new Qualifier(); + compatibility.setClassid("UNKNOWN"); + DATASOURCE.setOpenairecompatibility(compatibility); + } + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -104,6 +114,10 @@ public class MergeGraphSparkJob { .map((MapFunction, Tuple2>, P>) value -> { Optional

p = Optional.ofNullable(value._1()).map(Tuple2::_2); Optional b = Optional.ofNullable(value._2()).map(Tuple2::_2); + + if (p.orElse((P) b.orElse((B) DATASOURCE)) instanceof Datasource) { + return mergeDatasource(p, b); + } switch (priority) { default: case "BETA": @@ -119,6 +133,36 @@ public class MergeGraphSparkJob { .json(outputPath); } + /** + * Datasources involved in the merge operation doesn't obey to the infra precedence policy, but relies on a custom + * behaviour that, given two datasources from beta and prod returns the one from prod with the highest + * compatibility among the two. + * + * @param p datasource from PROD + * @param b datasource from BETA + * @param

Datasource class type from PROD + * @param Datasource class type from BETA + * @return the datasource from PROD with the highest compatibility level. + */ + protected static

P mergeDatasource(Optional

p, Optional b) { + if (p.isPresent() & !b.isPresent()) { + return p.get(); + } + if (b.isPresent() & !p.isPresent()) { + return (P) b.get(); + } + if (!b.isPresent() & !p.isPresent()) { + return null; // unlikely, at least one should be produced by the join operation + } + + Datasource dp = (Datasource) p.get(); + Datasource db = (Datasource) b.get(); + + List list = Arrays.asList(dp.getOpenairecompatibility(), db.getOpenairecompatibility()); + dp.setOpenairecompatibility(Collections.min(list, new DatasourceCompatibilityComparator())); + return (P) dp; + } + private static

P mergeWithPriorityToPROD(Optional

p, Optional b) { if (b.isPresent() & !p.isPresent()) { return (P) b.get(); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 1e7b56ee9..adf7b92be 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -38,13 +38,11 @@ import java.io.IOException; import java.sql.Array; import java.sql.ResultSet; import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.List; +import java.util.*; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Predicate; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -197,7 +195,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final Datasource ds = new Datasource(); ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); - ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray())); + ds + .setOriginalId( + Arrays + .asList( + (String[]) rs.getArray("identities").getArray()) + .stream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList())); ds .setCollectedfrom( listKeyValues( @@ -243,7 +248,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i ds.setCertificates(field(rs.getString("certificates"), info)); ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array ds - .setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal + .setJournal( + journal( + rs.getString("officialname"), + rs.getString("issnPrinted"), + rs.getString("issnOnline"), + rs.getString("issnLinking"), + info)); // Journal ds.setDataInfo(info); ds.setLastupdatetimestamp(lastUpdateTimestamp); @@ -567,21 +578,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i return res; } - private Journal prepareJournal(final String name, final String sj, final DataInfo info) { - if (StringUtils.isNotBlank(sj)) { - final String[] arr = sj.split("@@@"); - if (arr.length == 3) { - final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null; - final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null; + private Journal prepareJournal(final ResultSet rs, final DataInfo info) throws SQLException { + if (Objects.isNull(rs)) { + return null; + } else { - final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null; - - if (issn != null || eissn != null || lissn != null) { - return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info); - } - } + return journal( + rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"), + rs.getString("issnLinking"), info); } - return null; } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index 63db13b8f..84b29e3d4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -153,6 +153,27 @@ public class OafMapperUtils { return p; } + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final DataInfo dataInfo) { + return journal( + name, + issnPrinted, + issnOnline, + issnLinking, + null, + null, + null, + null, + null, + null, + null, + dataInfo); + } + public static Journal journal( final String name, final String issnPrinted, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala index d1bf39475..ee2dbadfd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala @@ -1,4 +1,5 @@ package eu.dnetlib.dhp.sx.ebi +import eu.dnetlib.dhp.oa.merge.AuthorMerger import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} import org.apache.spark.sql.{Encoder, Encoders} @@ -14,6 +15,7 @@ object EBIAggregator { override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = { b.mergeFrom(a._2) + b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) if (b.getId == null) b.setId(a._2.getId) b @@ -22,6 +24,7 @@ object EBIAggregator { override def merge(wx: OafDataset, wy: OafDataset): OafDataset = { wx.mergeFrom(wy) + wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) if(wx.getId == null && wy.getId.nonEmpty) wx.setId(wy.getId) wx @@ -35,8 +38,6 @@ object EBIAggregator { Encoders.kryo(classOf[OafDataset]) } - - def getDLIUnknownAggregator(): Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown] = new Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown]{ override def zero: DLIUnknown = new DLIUnknown() @@ -69,6 +70,7 @@ object EBIAggregator { override def reduce(b: DLIDataset, a: (String, DLIDataset)): DLIDataset = { b.mergeFrom(a._2) + b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) if (b.getId == null) b.setId(a._2.getId) b @@ -76,6 +78,7 @@ object EBIAggregator { override def merge(wx: DLIDataset, wy: DLIDataset): DLIDataset = { wx.mergeFrom(wy) + wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) if(wx.getId == null && wy.getId.nonEmpty) wx.setId(wy.getId) wx @@ -96,6 +99,8 @@ object EBIAggregator { override def reduce(b: DLIPublication, a: (String, DLIPublication)): DLIPublication = { b.mergeFrom(a._2) + b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) + if (b.getId == null) b.setId(a._2.getId) b @@ -104,6 +109,7 @@ object EBIAggregator { override def merge(wx: DLIPublication, wy: DLIPublication): DLIPublication = { wx.mergeFrom(wy) + wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) if(wx.getId == null && wy.getId.nonEmpty) wx.setId(wy.getId) wx @@ -124,6 +130,7 @@ object EBIAggregator { override def reduce(b: Publication, a: (String, Publication)): Publication = { b.mergeFrom(a._2) + b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) if (b.getId == null) b.setId(a._2.getId) b @@ -132,6 +139,7 @@ object EBIAggregator { override def merge(wx: Publication, wy: Publication): Publication = { wx.mergeFrom(wy) + wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) if(wx.getId == null && wy.getId.nonEmpty) wx.setId(wy.getId) wx @@ -145,7 +153,6 @@ object EBIAggregator { Encoders.kryo(classOf[Publication]) } - def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{ override def zero: Relation = new Relation() @@ -166,10 +173,4 @@ object EBIAggregator { override def outputEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) } - - - - - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/postprocessing.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/postprocessing.sql index 778e3afd2..ea483a4a7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/postprocessing.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/postprocessing.sql @@ -1,10 +1,10 @@ DROP VIEW IF EXISTS ${hiveDbName}.result; CREATE VIEW IF NOT EXISTS ${hiveDbName}.result as - select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p + select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p union all - select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d + select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d union all - select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s + select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s union all - select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o; + select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql index 7ca672835..f0a4161ab 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql @@ -84,8 +84,10 @@ SELECT dc.id AS collectedfromid, dc.officialname AS collectedfromname, d.typology||'@@@dnet:datasource_typologies' AS datasourcetype, - 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, - concat_ws(' @@@ ', d.issn, d.eissn, d.lissn) AS journal + 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, + d.issn AS issnPrinted, + d.eissn AS issnOnline, + d.lissn AS issnLinking FROM dsm_datasources d diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJobTest.java new file mode 100644 index 000000000..28e8e5abc --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJobTest.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.oa.graph.merge; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Datasource; + +public class MergeGraphSparkJobTest { + + private ObjectMapper mapper; + + @BeforeEach + public void setUp() { + mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + } + + @Test + public void testMergeDatasources() throws IOException { + assertEquals( + "openaire-cris_1.1", + MergeGraphSparkJob + .mergeDatasource( + d("datasource_cris.json"), + d("datasource_UNKNOWN.json")) + .getOpenairecompatibility() + .getClassid()); + assertEquals( + "openaire-cris_1.1", + MergeGraphSparkJob + .mergeDatasource( + d("datasource_UNKNOWN.json"), + d("datasource_cris.json")) + .getOpenairecompatibility() + .getClassid()); + assertEquals( + "driver-openaire2.0", + MergeGraphSparkJob + .mergeDatasource( + d("datasource_native.json"), + d("datasource_driver-openaire2.0.json")) + .getOpenairecompatibility() + .getClassid()); + assertEquals( + "driver-openaire2.0", + MergeGraphSparkJob + .mergeDatasource( + d("datasource_driver-openaire2.0.json"), + d("datasource_native.json")) + .getOpenairecompatibility() + .getClassid()); + assertEquals( + "openaire4.0", + MergeGraphSparkJob + .mergeDatasource( + d("datasource_notCompatible.json"), + d("datasource_openaire4.0.json")) + .getOpenairecompatibility() + .getClassid()); + assertEquals( + "notCompatible", + MergeGraphSparkJob + .mergeDatasource( + d("datasource_notCompatible.json"), + d("datasource_UNKNOWN.json")) + .getOpenairecompatibility() + .getClassid()); + } + + private Optional d(String file) throws IOException { + String json = IOUtils.toString(getClass().getResourceAsStream(file)); + return Optional.of(mapper.readValue(json, Datasource.class)); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 2c10f8f58..af9fadb5a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -304,9 +304,40 @@ public class MappersTest { assertValidId(d.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertEquals(1, d.getAuthor().size()); - assertEquals(0, d.getSubject().size()); + assertEquals(1, d.getSubject().size()); assertEquals(1, d.getInstance().size()); assertEquals(1, d.getPid().size()); + assertNotNull(d.getInstance().get(0).getUrl()); + } + + @Test + void testClaimFromCrossref() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); + final List list = new OafToOafMapper(vocs, false).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + System.out.println(p.getTitle().get(0).getValue()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + } + + @Test + void testODFRecord() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml")); + List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + final Dataset p = (Dataset) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + System.out.println(p.getTitle().get(0).getValue()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); } private void assertValidId(final String id) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 011cc18e6..f663d6095 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -80,9 +80,9 @@ public class MigrateDbEntitiesApplicationTest { assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue()); assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue()); assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName()); - assertEquals("2579-5449", ds.getJournal().getIssnPrinted()); - assertEquals("2597-6540", ds.getJournal().getIssnOnline()); - assertEquals(null, ds.getJournal().getIssnLinking()); + assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted()); + assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline()); + assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking()); } @Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala new file mode 100644 index 000000000..4d83057f2 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.sx.graph + +import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} +import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication +import eu.dnetlib.dhp.sx.ebi.EBIAggregator +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.Test + +import scala.io.Source + +class SparkScholexplorerAggregationTest { + + + @Test + def testFunderRelationshipsMapping(): Unit = { + val publications = Source.fromInputStream(getClass.getResourceAsStream("publication.json")).mkString + + var s: List[DLIPublication] = List[DLIPublication]() + + val m: ObjectMapper = new ObjectMapper() + + m.enable(SerializationFeature.INDENT_OUTPUT) + + for (line <- publications.lines) { + s = m.readValue(line, classOf[DLIPublication]) :: s + + + } + + + implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] + val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").getOrCreate() + + + val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication] + + val unique = ds.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder)) + .groupByKey(_._1)(Encoders.STRING) + .agg(EBIAggregator.getDLIPublicationAggregator().toColumn) + .map(p => p._2) + + val uniquePubs: DLIPublication = unique.first() + + s.foreach(pp => assertFalse(pp.getAuthor.isEmpty)) + + + assertNotNull(uniquePubs.getAuthor) + assertFalse(uniquePubs.getAuthor.isEmpty) + + + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java index ed3b6efdc..ce00466df 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java @@ -2,4 +2,5 @@ package eu.dnetlib.dhp.sx.graph; public class SparkScholexplorerGraphImporterTest { + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_UNKNOWN.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_UNKNOWN.json new file mode 100644 index 000000000..a01085c8f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_UNKNOWN.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "UNKNOWN" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_cris.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_cris.json new file mode 100644 index 000000000..6f2b7aa7d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_cris.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire-cris_1.1" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_driver-openaire2.0.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_driver-openaire2.0.json new file mode 100644 index 000000000..d2e375f55 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_driver-openaire2.0.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "driver-openaire2.0" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_hostedby.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_hostedby.json new file mode 100644 index 000000000..03db887f5 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_hostedby.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "hostedBy" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_native.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_native.json new file mode 100644 index 000000000..7831a3fc3 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_native.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "native" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_notCompatible.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_notCompatible.json new file mode 100644 index 000000000..8dabe5d2c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_notCompatible.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "notCompatible" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire2.0.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire2.0.json new file mode 100644 index 000000000..e2db47943 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire2.0.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire2.0_data.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire2.0_data.json new file mode 100644 index 000000000..b8480faf0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire2.0_data.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0_data" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire3.0.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire3.0.json new file mode 100644 index 000000000..43bb0a7a4 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire3.0.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire3.0" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire4.0.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire4.0.json new file mode 100644 index 000000000..7cdba6a4e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/merge/datasource_openaire4.0.json @@ -0,0 +1 @@ +{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire4.0" }} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json index 0f1da7095..8f8aed3a0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json @@ -228,8 +228,18 @@ "value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions" }, { - "field": "journal", + "field": "issnPrinted", "type": "string", - "value": "2579-5449 @@@ 2597-6540 @@@ " + "value": "2579-5449" + }, + { + "field": "issnOnline", + "type": "string", + "value": "2579-5448" + }, + { + "field": "issnLinking", + "type": "string", + "value": "2579-5447" } ] diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml new file mode 100644 index 000000000..8f69a5e2d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml @@ -0,0 +1,68 @@ + + +

+ userclaim___::7f0f7807f17db50e5c2b5c452ccaf06d + userclaim___::7f0f7807f17db50e5c2b5c452ccaf06d + 2020-08-06T07:04:09.62Z + + + + + + 2020-08-06T07:20:57.911Z + openaire____ +
+ + A case report of serious haemolysis in a glucose-6-phosphate dehydrogenase-deficient COVID-19 patient receiving hydroxychloroquine + Maillart, E. + Leemans, S. + Van Noten, H. + Vandergraesen, T. + Mahadeb, B. + Salaouatchi, M. T. + De Bels, D. + Clevenbergh, P. + + http://dx.doi.org/10.1080/23744235.2020.1774644 + + Informa UK Limited + Crossref + Infectious Diseases + Microbiology (medical) + General Immunology and Microbiology + Infectious Diseases + General Medicine + journal-article + 0001 + 2020-06-04 + + UNKNOWN + + + 10.1080/23744235.2020.1774644 + Infectious Diseases + + + + + file%3A%2F%2F%2Fsrv%2Fclaims%2Frecords%2Fpublication%2Fcrossref + + + + + + + false + false + 0.9 + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml index 105d0c413..7796e4c37 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml @@ -1,77 +1,75 @@ + xmlns:oaf="http://namespace.openaire.eu/oaf" + xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> - r3f5b9831893::cca7367159bc3ff90cd2f75bf9dc21c4 - oai:nakala.fr:hdl_11280_847e01df - 2020-08-01T00:16:24.742Z + xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance"> + r3f5b9831893::01a497c6c6b44289c52dcdf22b6c0fc0 + oai:nakala.fr:hdl_11280_50f302c6 + 2020-10-03T06:06:52.228Z r3f5b9831893 - oai:nakala.fr:hdl_11280_847e01df - 2020-06-08T01:01:38Z - hdl_11280_2b09fc10 - hdl_11280_c1bc48d0 - hdl_11280_57c8db3a - 2020-08-01T00:31:35.625Z + oai:nakala.fr:hdl_11280_50f302c6 + 2020-09-19T23:56:08Z + hdl_11280_96355742 + hdl_11280_26914437 + hdl_11280_86561837 + 2020-10-19T15:39:52.151Z - 277 - http://hdl.handle.net/11280/847e01df - - http://hdl.handle.net/http://hdl.handle.net/11280/847e01df - - http://nakala.fr/data/11280/847e01df - + http://nakala.fr/data/11280/50f302c6 + 11280/50f302c6 + - DHAAP + Desbrosse, Xavier - - CVP_Notice277-1 place du Docteur Antoine Béclère _PHO02.jpg + + Les rues Stalingrad en France (1945-2013) - - Hôpital Saint-Antoine. Fragment de dalle funéraire trouvée en décembre 1932. Paris (XIIème arr.). Photographie d'Albert Citerne (1876-1970). Plaque de verre, 1932. Département Histoire de l'Architecture et Archéologie de Paris. - Nfa_1146 - Hôpital Saint-Antoine. Fragment de dalle funéraire trouvée en décembre 1932. Paris (XIIème arr.). Photographie d'Albert Citerne (1876-1970). Plaque de verre, 1932. Département Histoire de l'Architecture et Archéologie de Paris. + + Rues – Noms -- France + + + Cette carte appartient à la collection « Guerre froide vue d’en bas » élaborée dans le cadre de l’enquête 2009-2013 du réseau des correspondants départementaux de l’IHTP « La Guerre froide vue d’en bas : 1947-1967 », enquête conduite sous la direction de Philippe Buton Professeur d’Histoire contemporaine à l’Université de Reims, d’Olivier Büttner Ingénieur de Recherche IHTP-CNRS et de Michel Hastings, Professeur de Science politique à l’Institut d’Etudes Politiques de Lille. - Nakala by Huma-Num - + IHTP-CNRS + - DHAAP, Pôle Archéologique + (CNRS), Institut d'Histoire du Temps Présent (IHTP) - Centre National de la Recherche Scientifique - - 1932 + + 2013 - StillImage - - - + Carte + + + France + + - http://hdl.handle.net/11280/847e01df + 11280/50f302c6 + 0025 - OPEN + UNKNOWN und + xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance"> - - https%3A%2F%2Fwww.nakala.fr%2Foai_oa%2F11280%2F8892ab4b - oai:nakala.fr:hdl_11280_847e01df - 2020-06-08T01:01:38Z + + https%3A%2F%2Fwww.nakala.fr%2Foai_oa%2F11280%2F92c4d30b + oai:nakala.fr:hdl_11280_50f302c6 + 2020-09-19T23:56:08Z diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_record.xml new file mode 100644 index 000000000..e2d51a43d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_record.xml @@ -0,0 +1,102 @@ + + + + r3a507cdacc5::03b31980d9bb3c4609e6005c4a3baba6 + oai:lindat.mff.cuni.cz:11372/LRT-1844 + 2020-09-04T14:36:48.411Z + r3a507cdacc5 + oai:lindat.mff.cuni.cz:11372/LRT-1844 + 2016-12-07T11:10:30Z + hdl_11858_00-097C-0000-0007-710A-A + hdl_11858_00-097C-0000-0007-710B-8 + openaire_data + 2020-09-04T14:39:16.458Z + + + + 11372/LRT-1844 + + http://hdl.handle.net/11372/LRT-1844 + + + + Hercig, Tomáš + + + Brychcín, Tomáš + + + Svoboda, Lukáš + + + Konkol, Michal + + + Steinberger, Josef + + + + Restaurant Reviews CZ ABSA corpus v2 + + University of West Bohemia, Department of Computer Science and Engineering + 2016 + + + European Commission + info:eu-repo/grantAgreement/EC/FP7/630786 + + + + 2016 + 2016-12-07T11:10:30Z + 2016-12-07T11:10:30Z + + corpus + + + + + + Restaurant Reviews CZ ABSA - 2.15k reviews with their related target and category + + The work done is described in the paper: https://doi.org/10.13053/CyS-20-3-2469 + + + 11372/LRT-1844 + 2016-12-07 + 0021 + 2016-01-01 + OPEN + http://creativecommons.org/licenses/by-nc-sa/4.0/ + und + corda_______::630786 + + + + + + + https%3A%2F%2Flindat.mff.cuni.cz%2Frepository%2Foai%2Fopenaire_data + oai:lindat.mff.cuni.cz:11372/LRT-1844 + 2016-12-07T11:10:30Z + + + + + false + false + 0.9 + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/publication.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/publication.json new file mode 100644 index 000000000..539dd5e62 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/publication.json @@ -0,0 +1,10 @@ +{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters...","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-10-04T14:16:06.105Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-09-27T11:39:38.835Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-08-30T11:48:49.809Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-08-14T14:25:55.176Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"} +{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-08-09T11:35:23.526Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml index 05ca7d4ce..b287e9c88 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml @@ -62,6 +62,10 @@ dhp-schemas ${project.version}
+ + org.apache.httpcomponents + httpmime + org.elasticsearch diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index abac41b89..705160a2b 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -47,6 +47,7 @@ object DLIToOAF { "References" -> ("isRelatedTo", "relationship"), "IsRelatedTo" -> ("isRelatedTo", "relationship"), "IsSupplementedBy" -> ("isSupplementedBy", "supplement"), + "Documents"-> ("isRelatedTo", "relationship"), "Cites" -> ("cites", "citation"), "Unknown" -> ("isRelatedTo", "relationship"), "IsSourceOf" -> ("isRelatedTo", "relationship"), @@ -83,7 +84,7 @@ object DLIToOAF { val rel_inverse: Map[String, String] = Map( "isRelatedTo" -> "isRelatedTo", - "IsSupplementedBy" -> "isSupplementTo", + "isSupplementedBy" -> "isSupplementTo", "cites" -> "IsCitedBy", "IsCitedBy" -> "cites", "reviews" -> "IsReviewedBy" @@ -273,29 +274,18 @@ object DLIToOAF { } -// def convertDLIRelation(r: DLIRelation): Relation = { -// -// val result = new Relation -// if (!relationTypeMapping.contains(r.getRelType)) -// return null -// -// if (r.getProperties == null || r.getProperties.size() == 0 || (r.getProperties.size() == 1 && r.getProperties.get(0) == null)) -// return null -// val t = relationTypeMapping.get(r.getRelType) -// -// result.setRelType("resultResult") -// result.setRelClass(t.get._1) -// result.setSubRelType(t.get._2) -// result.setCollectedfrom(r.getProperties.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) -// result.setSource(generateId(r.getSource)) -// result.setTarget(generateId(r.getTarget)) -// -// if (result.getSource.equals(result.getTarget)) -// return null -// result.setDataInfo(generateDataInfo()) -// -// result -// } + def convertDLIRelation(r: Relation): Relation = { + + val rt = r.getRelType + if (!relationTypeMapping.contains(rt)) + return null + r.setRelType("resultResult") + r.setRelClass(relationTypeMapping(rt)._1) + r.setSubRelType(relationTypeMapping(rt)._2) + r.setSource(generateId(r.getSource)) + r.setTarget(generateId(r.getTarget)) + r + } def convertDLIDatasetTOOAF(d: DLIDataset): Dataset = { diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala index 165c3340b..f1e374f95 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala @@ -15,11 +15,13 @@ import org.apache.spark.{SparkConf, SparkContext} import org.codehaus.jackson.map.ObjectMapper import scala.collection.mutable.ArrayBuffer - +import scala.collection.JavaConverters._ object SparkExportContentForOpenAire { + + def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json"))) @@ -32,51 +34,54 @@ object SparkExportContentForOpenAire { .master(parser.get("master")).getOrCreate() - val sc:SparkContext = spark.sparkContext - val workingPath = parser.get("workingDirPath") + implicit val dliPubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication]) + implicit val dliDatEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset]) implicit val pubEncoder: Encoder[Publication] = Encoders.bean(classOf[Publication]) implicit val datEncoder: Encoder[OafDataset] = Encoders.bean(classOf[OafDataset]) implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation]) import spark.implicits._ + val dsRel = spark.read.load(s"$workingPath/relation_b").as[Relation] + dsRel.filter(r => r.getDataInfo==null || r.getDataInfo.getDeletedbyinference ==false) + .map(DLIToOAF.convertDLIRelation) + .filter(r => r!= null) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS") - val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j") - .map(s => new ObjectMapper().readValue(s, classOf[Relation])) - .filter(p => p.getDataInfo.getDeletedbyinference == false) - spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS") - val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset") - .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + val dsPubs = spark.read.load(s"$workingPath/publication").as[DLIPublication] + dsPubs + .filter(p=>p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIPublicationToOAF) + .filter(p=>p!= null) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS") + + + val dsDataset = spark.read.load(s"$workingPath/dataset").as[DLIDataset] + dsDataset .filter(p => p.getDataInfo.getDeletedbyinference == false) .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null) - spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS") - - - val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication") - .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication])) - .filter(p => p.getDataInfo.getDeletedbyinference == false) - .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null) - spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetDS") - val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication] - val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset] - val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation] + + val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS").as[Publication] + val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/export/datasetDS").as[OafDataset] + val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS").as[Relation] val pub_id = pubs.select("id").distinct() val dat_id = dats.select("id").distinct() - pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1") + pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_f1") - val relDS2= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] + val relDS2= spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation] - relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered") + relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationDS_filtered") val r_source = relDS2.select(relDS2("source")).distinct() @@ -87,22 +92,20 @@ object SparkExportContentForOpenAire { pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1) .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") - .write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationDS_filtered") dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1) .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") - .write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS") - spark.createDataset(sc.textFile(s"$workingPath/dataset") - .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) - .map(DLIToOAF.convertDLIDatasetToExternalReference) - .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference") - val pf = spark.read.load(s"$workingPath/publicationDS_filtered").select("id") - val relDS3 = spark.read.load(s"$workingPath/relationDS").as[Relation] + dsDataset.map(DLIToOAF.convertDLIDatasetToExternalReference).filter(p => p != null).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference") + + val pf = spark.read.load(s"$workingPath/export/publicationDS_filtered").select("id") + val relDS3 = spark.read.load(s"$workingPath/export/relationDS").as[Relation] val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2) - val extRef = spark.read.load(s"$workingPath/externalReference").as[DLIExternalReference] + val extRef = spark.read.load(s"$workingPath/export/externalReference").as[DLIExternalReference] spark.createDataset(relationTo.joinWith(extRef, relationTo("target").equalTo(extRef("id")), "inner").map(d => { val r = d._1 @@ -112,11 +115,11 @@ object SparkExportContentForOpenAire { var dli_ext = ArrayBuffer[DLIExternalReference]() f._2.foreach(d => if (dli_ext.size < 100) dli_ext += d ) (f._1, dli_ext) - })).write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference_grouped") + })).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/externalReference_grouped") - val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS_filtered").as[Publication] + val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/export/publicationDS_filtered").as[Publication] - val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/externalReference_grouped").as[(String, List[DLIExternalReference])] + val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/export/externalReference_grouped").as[(String, List[DLIExternalReference])] groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t => { @@ -128,29 +131,28 @@ object SparkExportContentForOpenAire { } else publication } - ).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS") + ).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS") - spark.createDataset(sc.textFile(s"$workingPath/dataset") - .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + dsDataset .map(DLIToOAF.convertClinicalTrial) - .filter(p => p != null)) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrials") + .filter(p => p != null) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrials") - val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/clinicalTrials").as[(String,String)] + val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/clinicalTrials").as[(String,String)] - val relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] + val relDS= spark.read.load(s"$workingPath/export/relationDS_f1").as[Relation] relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner") .map(k =>{ val currentRel = k._1 currentRel.setTarget(k._2._2) currentRel - }).write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrialsRels") + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/clinicalTrialsRels") - val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/clinicalTrialsRels").as[Relation] - val rels:Dataset[Relation] = spark.read.load(s"$workingPath/relationDS_filtered").as[Relation] + val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/export/clinicalTrialsRels").as[Relation] + val rels:Dataset[Relation] = spark.read.load(s"$workingPath/export/relationDS_filtered").as[Relation] rels.union(clRels).flatMap(r => { val inverseRel = new Relation @@ -162,18 +164,18 @@ object SparkExportContentForOpenAire { inverseRel.setSubRelType(r.getSubRelType) inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass)) List(r, inverseRel) - }).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS") + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/relationAS") - spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS_fixed") - spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS_fixed") + spark.read.load(s"$workingPath/export/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/publicationAS_fixed") + spark.read.load(s"$workingPath/export/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/export/datasetAS_fixed") - val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet) - val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet) - val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet) + val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/relationAS").as[Relation].map(DLIToOAF.toActionSet) + val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet) + val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/export/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet) - fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java new file mode 100644 index 000000000..95bea74a2 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java @@ -0,0 +1,111 @@ + +package eu.dnetlib.dhp.export.zenodo; + +import java.io.*; + +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class MakeTar implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(MakeTar.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + MakeTar.class + .getResourceAsStream( + "/eu/dnetlib/dhp/export/input_maketar_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + final String outputPath = parser.get("targetPath"); + log.info("hdfsPath: {}", outputPath); + + final String hdfsNameNode = parser.get("nameNode"); + log.info("nameNode: {}", hdfsNameNode); + + final String inputPath = parser.get("sourcePath"); + log.info("input path : {}", inputPath); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + + makeTArArchive(fileSystem, inputPath, outputPath); + + } + + public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException { + + RemoteIterator dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath)); + + while (dir_iterator.hasNext()) { + LocatedFileStatus fileStatus = dir_iterator.next(); + + Path p = fileStatus.getPath(); + String p_string = p.toString(); + String entity = p_string.substring(p_string.lastIndexOf("/") + 1); + + write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity); + } + + } + + private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name) + throws IOException { + + Path hdfsWritePath = new Path(outputPath); + FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, true); + + } + fsDataOutputStream = fileSystem.create(hdfsWritePath); + + TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream()); + + RemoteIterator fileStatusListIterator = fileSystem + .listFiles( + new Path(inputPath), true); + + while (fileStatusListIterator.hasNext()) { + LocatedFileStatus fileStatus = fileStatusListIterator.next(); + + Path p = fileStatus.getPath(); + String p_string = p.toString(); + if (!p_string.endsWith("_SUCCESS")) { + String name = p_string.substring(p_string.lastIndexOf("/") + 1); + TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz"); + entry.setSize(fileStatus.getLen()); + ar.putArchiveEntry(entry); + + InputStream is = fileSystem.open(fileStatus.getPath()); + + BufferedInputStream bis = new BufferedInputStream(is); + + int count; + byte data[] = new byte[1024]; + while ((count = bis.read(data, 0, data.length)) != -1) { + ar.write(data, 0, count); + } + bis.close(); + ar.closeArchiveEntry(); + + } + + } + + ar.close(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java new file mode 100644 index 000000000..1dcbf6ccc --- /dev/null +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.export.zenodo; + +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.api.MissingConceptDoiException; +import eu.dnetlib.dhp.common.api.ZenodoAPIClient; + +public class SendToZenodoHDFS implements Serializable { + + private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class); + + public static void main(final String[] args) throws Exception, MissingConceptDoiException { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SendToZenodoHDFS.class + .getResourceAsStream( + "/eu/dnetlib/dhp/export/upload_zenodo.json"))); + + parser.parseArgument(args); + + final String hdfsPath = parser.get("hdfsPath"); + final String hdfsNameNode = parser.get("nameNode"); + final String access_token = parser.get("accessToken"); + final String connection_url = parser.get("connectionUrl"); + final String metadata = parser.get("metadata"); + final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition")); + final String concept_rec_id = Optional + .ofNullable(parser.get("conceptRecordId")) + .orElse(null); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + + RemoteIterator fileStatusListIterator = fileSystem + .listFiles( + new Path(hdfsPath), true); + ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token); + if (newDeposition) { + zenodoApiClient.newDeposition(); + } else { + if (concept_rec_id == null) { + throw new MissingConceptDoiException("No concept record id has been provided"); + } + zenodoApiClient.newVersion(concept_rec_id); + } + + while (fileStatusListIterator.hasNext()) { + LocatedFileStatus fileStatus = fileStatusListIterator.next(); + + Path p = fileStatus.getPath(); + String p_string = p.toString(); + if (!p_string.endsWith("_SUCCESS")) { + // String tmp = p_string.substring(0, p_string.lastIndexOf("/")); + String name = p_string.substring(p_string.lastIndexOf("/") + 1); + log.info("Sending information for community: " + name); + FSDataInputStream inputStream = fileSystem.open(p); + zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen()); + + } + + } + + zenodoApiClient.sendMretadata(metadata); + zenodoApiClient.publish(); + + } + +} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json new file mode 100644 index 000000000..6d90ced2c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "n", + "paramLongName": "nameNode", + "paramDescription": "the Name Node", + "paramRequired": true + }, + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source path", + "paramRequired": true + }, + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target path", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json new file mode 100644 index 000000000..66676005e --- /dev/null +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json @@ -0,0 +1,45 @@ + +[ + { + "paramName":"nd", + "paramLongName":"newDeposition", + "paramDescription": "if it is a new deposition (true) or a new version (false)", + "paramRequired": true + }, + { + "paramName":"cri", + "paramLongName":"conceptRecordId", + "paramDescription": "The id of the concept record for a new version", + "paramRequired": false + }, + { + "paramName":"hdfsp", + "paramLongName":"hdfsPath", + "paramDescription": "the path of the folder tofind files to send to Zenodo", + "paramRequired": true + }, + { + "paramName": "nn", + "paramLongName": "nameNode", + "paramDescription": "the name node", + "paramRequired": true + }, + { + "paramName": "at", + "paramLongName": "accessToken", + "paramDescription": "the access token for the deposition", + "paramRequired": false + }, + { + "paramName":"cu", + "paramLongName":"connectionUrl", + "paramDescription": "the url to connect to deposit", + "paramRequired": false + }, + { + "paramName":"m", + "paramLongName":"metadata", + "paramDescription": "metadata associated to the deposition", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml new file mode 100644 index 000000000..3b9aaca2a --- /dev/null +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml @@ -0,0 +1,48 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.wf.rerun.failnodes + false + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + "com.cloudera.spark.lineage.NavigatorAppListener" + + + spark2SqlQueryExecutionListeners + "com.cloudera.spark.lineage.NavigatorQueryListener" + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml new file mode 100644 index 000000000..6d7056503 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml @@ -0,0 +1,53 @@ + + + + sourcePath + the source path + + + targetPath + the target path + + + metadata + the metadata + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.export.zenodo.MakeTar + -t${targetPath} + -n${nameNode} + -s${sourcePath} + + + + + + + + + eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS + --hdfsPath/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar + --nameNode${nameNode} + --accessTokenb6ddrY6b77WxcDEevn9gqVE5sL5sDNjdUijt75W3o7cQo5vpFFI48dMiu8Gv + --connectionUrlhttps://zenodo.org/api/deposit/depositions + --metadata${metadata} + --conceptRecordId1200252 + --newDepositionfalse + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala index 0bd746cff..cb04cf9e9 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala @@ -5,9 +5,7 @@ import java.time.format.DateTimeFormatter import eu.dnetlib.dhp.schema.oaf.Relation import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession + import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.junit.jupiter.api.Test @@ -23,6 +21,19 @@ class ExportDLITOOAFTest { } + + @Test + def testMappingRele():Unit = { + + val r:Relation = new Relation + r.setSource("60|fbff1d424e045eecf24151a5fe3aa738") + r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877") + + val r1 =DLIToOAF.convertDLIRelation(r) + println(r1.getSource, r1.getTarget) + + } + @Test def testPublicationMapping():Unit = { diff --git a/pom.xml b/pom.xml index 52edd497f..03c69108d 100644 --- a/pom.xml +++ b/pom.xml @@ -328,7 +328,7 @@ eu.dnetlib dnet-pace-core - 4.0.4 + 4.0.5 eu.dnetlib @@ -431,6 +431,21 @@ ${common.compress.version} + + + org.apache.commons + commons-csv + ${common.csv.version} + + + + + + org.apache.poi + poi-ooxml + ${apache.poi.version} + + org.json4s json4s-jackson_2.11 @@ -666,8 +681,10 @@ 3.1.1 7.5.0 4.7.2 - 1.1 + 1.20 3.5.3 4.13.0 + 1.8 + 4.1.2