Miriam Baglioni 2020-06-22 19:14:06 +02:00
parent 1566fd590e
commit 3da12be81f
5 changed files with 29 additions and 42 deletions

View File

@@ -35,11 +35,13 @@ public class Mapper implements Serializable {
externals.add(kv);
}
out.setUrl(Constants.PUBLICATION_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "publication"));
break;
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out.setUrl(Constants.DATASET_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "dataset"));
break;
case "software":
@@ -59,10 +61,12 @@ public class Mapper implements Serializable {
.ifPresent(
value -> externals.add(KeyValue.newInstance("programming language", value.getClassname())));
out.setUrl(Constants.SOFTWARE_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "software"));
break;
case "other":
out.setUrl(Constants.ORP_URL + input.getId().substring(3));
externals.add(KeyValue.newInstance("result type", "other"));
break;
}
@@ -100,13 +104,10 @@ public class Mapper implements Serializable {
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> {
Iterator<Field<String>> it = value.iterator();
if (it.hasNext()) {
out.setName(it.next().getValue());
}
it.forEachRemaining(v -> externals.add(KeyValue.newInstance("description", v.getValue())));
});
.ifPresent(value ->
getDescription(out, externals, value)
);
Optional
.ofNullable(input.getEmbargoenddate())
@@ -133,7 +134,6 @@ public class Mapper implements Serializable {
.ofNullable(v.getUrl())
.ifPresent(u -> u.forEach(url -> urlSet.add(url)));
}));
Optional
@@ -180,7 +180,6 @@ public class Mapper implements Serializable {
.add(
KeyValue
.newInstance("subject", s.getQualifier().getClassid() + ":" + s.getValue()))));
externals.add(KeyValue.newInstance("resource type", input.getResourcetype().getClassid()));
cfSet.forEach(cf -> externals.add(KeyValue.newInstance("collected from", cf)));
hbSet.forEach(hb -> externals.add(KeyValue.newInstance("hosted by", hb)));
@@ -193,31 +192,13 @@ public class Mapper implements Serializable {
return out;
}
private static eu.dnetlib.dhp.schema.dump.oaf.Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
eu.dnetlib.dhp.schema.dump.oaf.Author a = new eu.dnetlib.dhp.schema.dump.oaf.Author();
Optional
.ofNullable(oa.getAffiliation())
.ifPresent(
value -> a
.setAffiliation(
value
.stream()
.map(aff -> aff.getValue())
.collect(Collectors.toList())));
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional
.ofNullable(oa.getPid())
.ifPresent(
value -> a
.setPid(
value
.stream()
.map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
return a;
private static void getDescription(CatalogueEntry out, List<KeyValue> externals, List<Field<String>> value) {
Iterator<Field<String>> it = value.iterator();
if (it.hasNext()) {
out.setNotes(it.next().getValue());
}
it.forEachRemaining(v -> externals.add(KeyValue.newInstance("description", v.getValue())));
}
}
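Note on the refactor above: the inline description handling is replaced by the new getDescription helper, so the first description becomes the catalogue entry's notes while any remaining descriptions are appended to externals as "description" key-value pairs. A minimal standalone sketch of that behaviour, using simplified stand-ins for CatalogueEntry, KeyValue and Field (the real dnet schema classes are not reproduced here):

	import java.util.ArrayList;
	import java.util.Iterator;
	import java.util.List;

	public class DescriptionSplitSketch {

		// Simplified stand-in for CatalogueEntry: only the notes field matters here.
		static class Entry {
			String notes;
		}

		// Simplified stand-in for KeyValue.
		static class KV {
			final String key;
			final String value;

			KV(String key, String value) {
				this.key = key;
				this.value = value;
			}
		}

		// Mirrors getDescription: first description -> notes, remaining ones -> externals.
		static void splitDescriptions(Entry out, List<KV> externals, List<String> descriptions) {
			Iterator<String> it = descriptions.iterator();
			if (it.hasNext()) {
				out.notes = it.next();
			}
			it.forEachRemaining(d -> externals.add(new KV("description", d)));
		}

		public static void main(String[] args) {
			Entry out = new Entry();
			List<KV> externals = new ArrayList<>();
			splitDescriptions(out, externals, List.of("main abstract", "secondary abstract"));
			System.out.println(out.notes);              // main abstract
			System.out.println(externals.get(0).value); // secondary abstract
		}
	}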

View File

@@ -92,7 +92,7 @@ public class SparkDumpRISISCatalogue implements Serializable {
value -> execMap(value, communityName),
Encoders.bean(eu.dnetlib.dhp.schema.dump.gcat.CatalogueEntry.class))
.filter(Objects::nonNull)
.repartition(1)
.repartition(1)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")

View File

@@ -122,7 +122,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -148,7 +148,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -174,7 +174,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -200,7 +200,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>

View File

@@ -69,17 +69,17 @@ public class DumpJobTest {
}
@Test
public void testDataset() throws Exception {
public void testSoftware() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/dataset.json")
.getResource("/eu/dnetlib/dhp/oa/graph/dump/gcat/software.json")
.getPath();
SparkDumpRISISCatalogue.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/result",
"-sourcePath", sourcePath,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
"-communityName", "risis"
});

File diff suppressed because one or more lines are too long