Adding H2020 Classification, topic code and topic description to H2020 projects #46

Merged
claudio.atzori merged 59 commits from miriam.baglioni/dnet-hadoop:h2020classification into master 2020-10-05 14:14:39 +02:00
10 changed files with 113 additions and 63 deletions
Showing only changes of commit ae08b3c0dd - Show all commits

View File

@ -5,6 +5,8 @@ import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.lang3.StringUtils;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
@ -19,6 +21,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
public static final String BLANK = "";
@Override
public String getName() {
return "normalizeDate";
@ -27,10 +31,10 @@ public class NormalizeDate extends AbstractExtensionFunction {
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
return new StringValue("");
return new StringValue(BLANK);
}
String s = arguments[0].head().getStringValue();
return new StringValue(_year(s));
return new StringValue(_normalizeDate(s));
}
@Override
@ -55,8 +59,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
return SequenceType.SINGLE_STRING;
}
private String _year(String s) {
final String date = s != null ? s.trim() : "";
private String _normalizeDate(String s) {
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
for (String format : normalizeDateFormats) {
try {
@ -66,6 +70,6 @@ public class NormalizeDate extends AbstractExtensionFunction {
} catch (ParseException e) {
}
}
return "";
return BLANK;
}
}

View File

@ -194,10 +194,10 @@ public class SparkDedupTest implements Serializable {
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
.count();
assertEquals(3432, orgs_simrel);
assertEquals(7152, pubs_simrel);
assertEquals(3082, orgs_simrel);
assertEquals(7036, pubs_simrel);
assertEquals(344, sw_simrel);
assertEquals(458, ds_simrel);
assertEquals(442, ds_simrel);
assertEquals(6750, orp_simrel);
}
@ -343,8 +343,8 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count();
assertEquals(1276, orgs_mergerel);
assertEquals(1442, pubs_mergerel);
assertEquals(1272, orgs_mergerel);
assertEquals(1438, pubs_mergerel);
assertEquals(288, sw_mergerel);
assertEquals(472, ds_mergerel);
assertEquals(718, orp_mergerel);
@ -390,10 +390,10 @@ public class SparkDedupTest implements Serializable {
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
.count();
assertEquals(82, orgs_deduprecord);
assertEquals(66, pubs_deduprecord);
assertEquals(85, orgs_deduprecord);
assertEquals(65, pubs_deduprecord);
assertEquals(51, sw_deduprecord);
assertEquals(96, ds_deduprecord);
assertEquals(97, ds_deduprecord);
assertEquals(89, orp_deduprecord);
}
@ -473,12 +473,12 @@ public class SparkDedupTest implements Serializable {
.distinct()
.count();
assertEquals(897, publications);
assertEquals(835, organizations);
assertEquals(896, publications);
assertEquals(838, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
assertEquals(200, softwares);
assertEquals(388, dataset);
assertEquals(389, dataset);
assertEquals(517, otherresearchproduct);
long deletedOrgs = jsc
@ -533,7 +533,7 @@ public class SparkDedupTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(4866, relations);
assertEquals(4858, relations);
// check deletedbyinference
final Dataset<Relation> mergeRels = spark

View File

@ -168,10 +168,10 @@ public class SparkStatsTest implements Serializable {
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
.count();
assertEquals(121, orgs_blocks);
assertEquals(110, pubs_blocks);
assertEquals(21, sw_blocks);
assertEquals(67, ds_blocks);
assertEquals(55, orp_blocks);
assertEquals(549, orgs_blocks);
assertEquals(299, pubs_blocks);
assertEquals(122, sw_blocks);
assertEquals(186, ds_blocks);
assertEquals(170, orp_blocks);
}
}

View File

@ -6,9 +6,10 @@ import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.clearspring.analytics.util.Lists;
import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -148,15 +149,22 @@ public class CleaningFunctions {
if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList());
} else {
a.setPid(
a.getPid().stream()
a
.setPid(
a
.getPid()
.stream()
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
return p;
})
.collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new))
.collect(
Collectors
.toMap(
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
LinkedHashMap::new))
.values()
.stream()
.collect(Collectors.toList()));

View File

@ -38,13 +38,11 @@ import java.io.IOException;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.*;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
@ -197,7 +195,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray()));
ds
.setOriginalId(
Arrays
.asList(
(String[]) rs.getArray("identities").getArray())
.stream()
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList()));
ds
.setCollectedfrom(
listKeyValues(
@ -243,7 +248,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setCertificates(field(rs.getString("certificates"), info));
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
ds
.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
.setJournal(
journal(
rs.getString("officialname"),
rs.getString("issnPrinted"),
rs.getString("issnOnline"),
rs.getString("issnLinking"),
info)); // Journal
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
@ -567,21 +578,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return res;
}
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
if (StringUtils.isNotBlank(sj)) {
final String[] arr = sj.split("@@@");
if (arr.length == 3) {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null;
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null;
if (issn != null || eissn != null || lissn != null) {
return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info);
}
}
}
private Journal prepareJournal(final ResultSet rs, final DataInfo info) throws SQLException {
if (Objects.isNull(rs)) {
return null;
} else {
return journal(
rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"),
rs.getString("issnLinking"), info);
}
}
@Override

View File

@ -153,6 +153,27 @@ public class OafMapperUtils {
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final DataInfo dataInfo) {
return journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null,
dataInfo);
}
public static Journal journal(
final String name,
final String issnPrinted,

View File

@ -85,7 +85,9 @@ SELECT
dc.officialname AS collectedfromname,
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
concat_ws(' @@@ ', d.issn, d.eissn, d.lissn) AS journal
d.issn AS issnPrinted,
d.eissn AS issnOnline,
d.lissn AS issnLinking
FROM dsm_datasources d

View File

@ -80,9 +80,9 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
assertEquals("2579-5449", ds.getJournal().getIssnPrinted());
assertEquals("2597-6540", ds.getJournal().getIssnOnline());
assertEquals(null, ds.getJournal().getIssnLinking());
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking());
}
@Test

View File

@ -228,8 +228,18 @@
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
},
{
"field": "journal",
"field": "issnPrinted",
"type": "string",
"value": "2579-5449 @@@ 2597-6540 @@@ "
"value": "2579-5449"
},
{
"field": "issnOnline",
"type": "string",
"value": "2579-5448"
},
{
"field": "issnLinking",
"type": "string",
"value": "2579-5447"
}
]

View File

@ -328,7 +328,7 @@
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
<version>4.0.4</version>
<version>4.0.5</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>