changed the mapping and added new resources for testing

This commit is contained in:
Miriam Baglioni 2020-06-23 15:30:34 +02:00
parent d6838e18e6
commit 563378ce3f
5 changed files with 302 additions and 179 deletions

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.oa.graph.dump.gcat;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.*;
import java.util.stream.Collectors;
@ -10,7 +11,10 @@ import eu.dnetlib.dhp.schema.dump.gcat.CatalogueEntry;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import org.apache.avro.generic.GenericData;
import org.apache.commons.lang3.StringUtils;
public class Mapper implements Serializable {
@ -22,26 +26,31 @@ public class Mapper implements Serializable {
Set<String> urlSet = new HashSet<>();
Set<String> cfSet = new HashSet<>();
Set<String> hbSet = new HashSet<>();
Set<String> countrySet = new HashSet<>();
if (ort.isPresent()) {
switch (ort.get().getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
KeyValue kv = new KeyValue();
kv.setKey("journal");
kv.setValue(j.getName() + ", " + j.getVol() + ", " + j.getIss());
externals.add(kv);
}
externals.add(KeyValue.newInstance("Journal" , Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal())
.map(value -> value.getName() +", " + value.getVol() + ", " + value.getIss())
.orElse("")));
// if (journal.isPresent()) {
// Journal j = journal.get();
// KeyValue kv = new KeyValue();
// kv.setKey("Journal");
// kv.setValue(j.getName() + ", " + j.getVol() + ", " + j.getIss());
// externals.add(kv);
// }
out.setUrl(Constants.PUBLICATION_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "publication"));
externals.add(KeyValue.newInstance("Result Type", "publication"));
break;
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out.setVersion(Optional.ofNullable(id.getVersion())
.map(v -> v.getValue())
.orElse(""));
out.setUrl(Constants.DATASET_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "dataset"));
externals.add(KeyValue.newInstance("Result Type", "dataset"));
break;
case "software":
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
@ -52,34 +61,42 @@ public class Mapper implements Serializable {
.ofNullable(is.getDocumentationUrl())
.ifPresent(value -> value.forEach(v -> urlSet.add(v.getValue())));
Optional
externals.add(KeyValue.newInstance("Programming Language",Optional
.ofNullable(is.getProgrammingLanguage())
.ifPresent(
value -> externals.add(KeyValue.newInstance("programming language", value.getClassname())));
.map(v -> v.getClassname())
.orElse("")));
// .ifPresent(
// value -> externals.add(KeyValue.newInstance("Programming Language", value.getClassname())));
out.setUrl(Constants.SOFTWARE_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "software"));
externals.add(KeyValue.newInstance("Result Type", "software"));
break;
case "other":
out.setUrl(Constants.ORP_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "other"));
externals.add(KeyValue.newInstance("Result Type", "other"));
break;
}
out.setLicense_id(Constants.DEFAULT_LICENCE_ID);
Optional
.ofNullable(input.getAuthor())
.ifPresent(
value -> value
.forEach(v -> externals.add(KeyValue.newInstance("author", v.getFullname()))));
Optional<List<Author>> oauth = Optional
.ofNullable(input.getAuthor());
List<String> authList = new ArrayList<>();
if(oauth.isPresent()){
authList = oauth.get().stream().map(v -> v.getFullname()).collect(Collectors.toList());
}
if (authList.size() == 0){
externals.add(KeyValue.newInstance("Author", ""));
}else{
authList.forEach(a -> externals.add(KeyValue.newInstance("Author", a)));
}
Optional
externals.add(KeyValue.newInstance("Access Right", Optional
.ofNullable(input.getBestaccessright())
.ifPresent(
value -> externals
.add(KeyValue.newInstance("access right", Constants.gcatCatalogue.get(value.getClassid()))));
.map(
value -> value.getClassname())
.orElse("")));
Optional
.ofNullable(input.getCollectedfrom())
@ -87,37 +104,41 @@ public class Mapper implements Serializable {
value -> value
.forEach(v -> cfSet.add(v.getValue())));
Optional
.ofNullable(input.getContributor())
.ifPresent(
value -> value
.forEach(v -> externals.add(KeyValue.newInstance("contributor", v.getValue()))));
Optional<List<Field<String>>> ocont = Optional
.ofNullable(input.getContributor());
if(ocont.isPresent()){
ocont.get().forEach(v -> externals.add(KeyValue.newInstance("Contributor", v.getValue())));
}else{
externals.add(KeyValue.newInstance("Contributor",""));
}
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> value
.forEach(v -> externals.add(KeyValue.newInstance("country", v.getClassname()))));
.forEach(v -> countrySet.add(v.getClassname())));
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value ->
getDescription(out, externals, value));
Optional
externals.add(KeyValue.newInstance("Embargo End Date", Optional
.ofNullable(input.getEmbargoenddate())
.ifPresent(oStr -> externals.add(KeyValue.newInstance("embargo end date", oStr.getValue())));
.map(value -> value.getValue())
.orElse("")));
//.ifPresent(oStr -> externals.add(KeyValue.newInstance("Embargo End Date", oStr.getValue())));
final List<String> formatList = new ArrayList<>();
final Set<String> formatSet = new HashSet<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.forEach(f -> formatList.add(f.getValue())));
.ifPresent(value -> value.forEach(f -> formatSet.add(f.getValue())));
String id = input.getId();
String id = input.getId().toLowerCase();
out.setName(id.substring(id.indexOf('|') + 1).replace(":", "-"));
final Set<String> itSet = new HashSet<>();
Optional
.ofNullable(input.getInstance())
.ifPresent(
@ -132,11 +153,16 @@ public class Mapper implements Serializable {
.ofNullable(v.getUrl())
.ifPresent(u -> u.forEach(url -> urlSet.add(url)));
Optional.ofNullable(v.getInstancetype())
.ifPresent(it -> itSet.add(it.getClassname()));
}));
Optional
externals.add(KeyValue.newInstance("Language", Optional
.ofNullable(input.getLanguage())
.ifPresent(value -> externals.add(KeyValue.newInstance("language", value.getClassname())));
.map(value -> value.getClassname())
.orElse("")));
//.ifPresent(value -> externals.add(KeyValue.newInstance("Language", value.getClassname())));
List<StructuredProperty> iTitle = Optional
.ofNullable(input.getTitle())
@ -149,44 +175,71 @@ public class Mapper implements Serializable {
if (iTitle.size() > 0) {
out.setTitle(iTitle.get(0).getValue());
}else{
out.setTitle("");
}
final Set<String> pidSet = new HashSet<>();
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> value
.forEach(
v -> externals
.add(KeyValue.newInstance("pid", v.getQualifier().getClassid() + ":" + v.getValue()))));
v -> pidSet.add(v.getQualifier().getClassid() + ":" + v.getValue())));
Optional
externals.add(KeyValue.newInstance("Publication Date", Optional
.ofNullable(input.getDateofacceptance())
.ifPresent(value -> externals.add(KeyValue.newInstance("publication date", value.getValue())));
.map(value -> value.getValue())
.orElse("")));
//.ifPresent(value -> externals.add(KeyValue.newInstance("Publication Date", value.getValue())));
Optional
externals.add(KeyValue.newInstance("Publisher", Optional
.ofNullable(input.getPublisher())
.ifPresent(value -> externals.add(KeyValue.newInstance("publisher", value.getValue())));
.map(value -> value.getValue())
.orElse("")));
//.ifPresent(value -> externals.add(KeyValue.newInstance("Publisher", value.getValue())));
List<ControlledField> subjectList = new ArrayList<>();
Set<String> kwSet = new HashSet<>();
List<String> sbjList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.stream()
.forEach(
s -> externals
.add(
KeyValue
.newInstance("subject", s.getQualifier().getClassid() + ":" + s.getValue()))));
s -> {
String classId = s.getQualifier().getClassid();
if (!classId.equals("keyword") &&
StringUtils.isNotEmpty(classId)){
sbjList.add(classId + ":" + s.getValue());
Optional
.ofNullable(input.getResourcetype())
.ifPresent(value -> externals.add(KeyValue.newInstance("resource type", value.getClassname())));
}else{
kwSet.add(s.getValue());
}
cfSet.forEach(cf -> externals.add(KeyValue.newInstance("collected from", cf)));
hbSet.forEach(hb -> externals.add(KeyValue.newInstance("hosted by", hb)));
urlSet.forEach(url -> externals.add(KeyValue.newInstance("url", url)));
}));
if(sbjList.size() == 0){
externals
.add(
KeyValue
.newInstance("Subject", ""));
}else{
sbjList.forEach(s -> externals
.add(
KeyValue
.newInstance("Subject",s )));
}
cfSet.remove("Unknown Repository");
externals.add(KeyValue.newInstance("Collected From", getListOfValues(cfSet)));
hbSet.remove("Unknown Repository");
externals.add(KeyValue.newInstance("Hosted By", getListOfValues(hbSet)));
externals.add(KeyValue.newInstance("URL", getListOfValues(urlSet)));
externals.add(KeyValue.newInstance("Country", getListOfValues(countrySet)));
externals.add(KeyValue.newInstance("Format", getListOfValues(formatSet)));
externals.add(KeyValue.newInstance("PID", getListOfValues(pidSet)));
externals.add(KeyValue.newInstance("Resource Type", getListOfValues(itSet)));
externals.add(KeyValue.newInstance("Keyword", getListOfValues(kwSet)));
out.setExtras(externals);
}
@ -194,12 +247,22 @@ public class Mapper implements Serializable {
return out;
}
/**
 * Joins the given values into one string, separating consecutive values with {@code "; "}.
 *
 * @param cfSet the values to join; the output follows the iteration order of the set
 * @return the joined values with no trailing separator, or the empty string when the set is empty
 */
private static String getListOfValues(Set<String> cfSet) {
    // String.join inserts the separator only between elements, so no trailing "; " has to be trimmed.
    return String.join("; ", cfSet);
}
private static void getDescription(CatalogueEntry out, List<KeyValue> externals, List<Field<String>> value) {
Iterator<Field<String>> it = value.iterator();
if (it.hasNext()) {
out.setNotes(it.next().getValue());
}
it.forEachRemaining(v -> externals.add(KeyValue.newInstance("description", v.getValue())));
else{
out.setNotes("");
}
it.forEachRemaining(v -> externals.add(KeyValue.newInstance("Description", v.getValue())));
}
}

View File

@ -94,37 +94,9 @@ public class DumpJobTest {
org.apache.spark.sql.Dataset<eu.dnetlib.dhp.schema.dump.gcat.CatalogueEntry> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.dump.gcat.CatalogueEntry.class));
Assertions.assertEquals(90, verificationDataset.count());
Assertions.assertEquals(3, verificationDataset.count());
// verificationDataset.show(false);
Assertions
.assertTrue(
verificationDataset.filter("bestAccessright.code = 'c_abf2'").count() == verificationDataset
.filter("bestAccessright.code = 'c_abf2' and bestAccessright.label = 'OPEN'")
.count());
Assertions
.assertTrue(
verificationDataset.filter("bestAccessright.code = 'c_16ec'").count() == verificationDataset
.filter("bestAccessright.code = 'c_16ec' and bestAccessright.label = 'RESTRICTED'")
.count());
Assertions
.assertTrue(
verificationDataset.filter("bestAccessright.code = 'c_14cb'").count() == verificationDataset
.filter("bestAccessright.code = 'c_14cb' and bestAccessright.label = 'CLOSED'")
.count());
Assertions
.assertTrue(
verificationDataset.filter("bestAccessright.code = 'c_f1cf'").count() == verificationDataset
.filter("bestAccessright.code = 'c_f1cf' and bestAccessright.label = 'EMBARGO'")
.count());
Assertions.assertTrue(verificationDataset.filter("size(context) > 0").count() == 90);
Assertions.assertTrue(verificationDataset.filter("type = 'dataset'").count() == 90);
// verificationDataset.select("instance.type").show(false);
//TODO verify value and name of the fields for vocab related value (i.e. accessright, bestaccessright)

View File

@ -17,7 +17,7 @@ import eu.dnetlib.dhp.oa.graph.dump.gcat.GCatAPIClient;
* NEVER EVER ENABLE THIS CLASS UNLESS YOU ABSOLUTELY KNOW WHAT YOU ARE DOING: with the proper parameters set it can
* drop a D4Science Catalogue
*/
@Disabled
//@Disabled
public class GCatAPIClientTest {
private static GCatAPIClient client;
@ -25,8 +25,8 @@ public class GCatAPIClientTest {
/**
 * Creates the shared client and configures it from JVM system properties.
 * The application token and the catalogue base URL are read from {@code gcat.application.token} and
 * {@code gcat.base.url} (for example passed with {@code -D...} on the command line); both default to the
 * empty string when the property is not set.
 */
@BeforeAll
public static void setup() {
    client = new GCatAPIClient();
    // Credentials and endpoints must be supplied at run time and never committed to the repository.
    // NOTE(review): any token that was ever hard-coded in this file should be treated as leaked and rotated.
    client.setApplicationToken(System.getProperty("gcat.application.token", ""));
    client.setGcatBaseURL(System.getProperty("gcat.base.url", ""));
}
@Test
@ -39,9 +39,9 @@ public class GCatAPIClientTest {
// The 'name' must be between 2 and 100 characters long and contain only lowercase alphanumeric characters, '-'
// and '_'.
// You can validate your name using the regular expression : ^[a-z0-9_\\-]{2,100}$
String objidentifier = "nstest::test";
String objidentifier = "fake";
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/gcat/gcat_pub.json"));
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/gcat/gcat_dat.json"));
System.out.println("Creating item...");
Assertions.assertEquals(HttpStatus.SC_CREATED, client.publish(json));
System.out.println("item created, now listing...");
@ -51,6 +51,28 @@ public class GCatAPIClientTest {
System.out.println("item purged");
}
/**
 * Publishes the software fixture through the client and expects the client to report
 * {@code HttpStatus.SC_CREATED}. The published item is not removed by this test.
 */
@Test
public void testPublish() throws IOException, URISyntaxException {
    // The 'name' must be between 2 and 100 characters long and contain only lowercase alphanumeric characters, '-'
    // and '_'.
    // You can validate your name using the regular expression : ^[a-z0-9_\\-]{2,100}$
    final String resource = "/eu/dnetlib/dhp/oa/graph/dump/gcat/gcat_software.json";
    final String json;
    // Close the classpath stream after reading and fail with a clear message when the fixture is missing.
    try (java.io.InputStream in = getClass().getResourceAsStream(resource)) {
        Assertions.assertNotNull(in, "test resource not found on classpath: " + resource);
        json = IOUtils.toString(in);
    }
    System.out.println("Creating item...");
    Assertions.assertEquals(HttpStatus.SC_CREATED, client.publish(json));
    System.out.println("item created, now listing...");
    // Listing check currently disabled:
    // Assertions.assertEquals(1, client.list(0, 10).size());
}
/**
 * Purges one fixed item from the catalogue through the client and expects the purge call to return true.
 */
@Test
public void purgeItem() throws IOException, URISyntaxException {
    // Name of the catalogue item to remove.
    final String itemName = "__biotools__--c17ebaca97f287b181090c9b4cba766e";
    final boolean purged = client.purge(itemName);
    Assertions.assertTrue(purged);
    System.out.println("item purged");
}
@Test
public void testPurgeUnexisting() throws IOException, URISyntaxException {
String id = "1234";

View File

@ -1,91 +1,60 @@
{
"extras": [
{
"key": "result type",
"value": "dataset"
},
{
"key": "author",
"value": "Philippe Laredo"
},
{
"key": "access right",
"value": "OPEN"
},
{
"key": "contributor",
"value": "European Commission"
},
{
"key": "language",
"value": "English"
},
{
"key": "pid",
"value": "doi:10.5281/zenodo.2560116"
},
{
"key": "pid",
"value": "doi:10.5281/zenodo.2560117"
},
{
"key": "publication date",
"value": "2019-02-08"
},
{
"key": "publisher",
"value": "Zenodo"
},
{
"key": "resource type",
"value": "Unknown"
},
{
"key": "collected from",
"value": "ZENODO"
},
{
"key": "collected from",
"value": "Datacite"
},
{
"key": "collected from",
"value": "figshare"
},
{
"key": "hosted by",
"value": "Zenodo"
},
{
"key": "hosted by",
"value": "ZENODO"
},
{
"key": "hosted by",
"value": "figshare"
},
{
"key": "url",
"value": "http://dx.doi.org/10.5281/zenodo.2560117"
},
{
"key": "url",
"value": "https://zenodo.org/record/2560117"
},
{
"key": "url",
"value": "http://dx.doi.org/10.5281/zenodo.2560116"
},
{
"key": "url",
"value": "https://figshare.com/articles/Introduction_of_RISIS_project_by_Philippe_Laredo/7699286"
}
],
"license_id": "notspecified",
"name": "dedup_wf_001--10160b3eafcedeb0a384fc400fe1c3fa",
{ "license_id": "notspecified",
"name": "fake",
"notes": "<p>Introduction of RISIS project by Philippe Laredo<\/p>",
"title": "Introduction of RISIS project by Philippe Laredo",
"url": "https://beta.risis.openaire.eu/search/dataset?datasetId=dedup_wf_001::10160b3eafcedeb0a384fc400fe1c3fa",
"version": "None",
"private": false,
"extras": [
{
"key": "Result Type",
"value": "dataset"
},
{
"key": "Access Right",
"value": "OPEN"
},{
"key": "Author",
"value": "Philippe Laredo"
},
{
"key": "Contributor",
"value": "European Commission"
},
{
"key": "Language",
"value": "English"
},
{
"key": "PID",
"value": "doi:10.5281/zenodo.2560116"
},
{
"key": "PID",
"value": "doi:10.5281/zenodo.2560117"
},
{
"key": "Publication Date",
"value": "2019-02-08"
},
{
"key": "Publisher",
"value": "Zenodo"
},
{
"key": "Resource Type",
"value": "Unknown"
},
{
"key": "Collected From",
"value": "ZENODO; Datacite; figshare"
},
{
"key": "Hosted By",
"value": "Zenodo; ZENODO; figshare"
},
{
"key": "URL",
"value": "http://dx.doi.org/10.5281/zenodo.2560117"
}
]
}

View File

@ -0,0 +1,97 @@
{
"extras": [
{
"key": "Programming Language",
"value": "UNKNOWN"
},
{
"key": "Result Type",
"value": "software"
},
{
"key": "Author",
"value": "Regev, Mor"
},
{
"key": "Author",
"value": "Simony, Erez"
},
{
"key": "Author",
"value": "Lee, Katherine"
},
{
"key": "Author",
"value": "Tan, Kean Ming"
},
{
"key": "Author",
"value": "Chen, Janice"
},
{
"key": "Author",
"value": "Hasson, Uri"
},
{
"key": "Access Right",
"value": "not available"
},
{
"key": "Contributor",
"value": "Regev, Mor"
},
{
"key": "Embargo End Date",
"value": ""
},
{
"key": "Language",
"value": "en-us"
},
{
"key": "Publication Date",
"value": "2018-01-01"
},
{
"key": "Publisher",
"value": "Code Ocean"
},
{
"key": "Collected From",
"value": "Datacite"
},
{
"key": "Hosted By",
"value": "Code Ocean"
},
{
"key": "URL",
"value": "http://dx.doi.org/10.24433/co.12957bc5-fa2b-488f-ae72-52e3fe362b5c; fake; https://codeocean.com/2018/10/30/intersubject-functional-correlation-lpar-isfc-rpar-as-a-function-of-attention"
},
{
"key": "Country",
"value": ""
},
{
"key": "Format",
"value": ""
},
{
"key": "PID",
"value": "doi:10.24433/co.12957bc5-fa2b-488f-ae72-52e3fe362b5c"
},
{
"key": "Resource Type",
"value": "Software"
},
{
"key": "Keyword",
"value": "neuroscience; attention; Capsule; Biology; language; fmri"
}
],
"license_id": "notspecified",
"name": "datacite____--6b1e3a2fa60ed8c27317a66d6357f795",
"notes": "This capsule demonstrates the inter-subject functional correlation (ISFC) analysis described in \"Propagation of information along the cortical hierarchy as a function of attention while reading and listening to stories \" by Regev, Simony, Lee, Tan, Chen and Hasson.",
"title": "Intersubject functional correlation (ISFC) as a function of attention",
"url": "https://beta.risis.openaire.eu/search/software?softwareId=datacite____::6b1e3a2fa60ed8c27317a66d6357f795"
}