merge branch with master

This commit is contained in:
Miriam Baglioni 2020-10-05 11:35:55 +02:00
commit ae08b3c0dd
10 changed files with 113 additions and 63 deletions

View File

@ -5,6 +5,8 @@ import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import org.apache.commons.lang3.StringUtils;
import net.sf.saxon.expr.XPathContext; import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Sequence; import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException; import net.sf.saxon.trans.XPathException;
@ -19,6 +21,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'"; private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
public static final String BLANK = "";
@Override @Override
public String getName() { public String getName() {
return "normalizeDate"; return "normalizeDate";
@ -27,10 +31,10 @@ public class NormalizeDate extends AbstractExtensionFunction {
@Override @Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) { if (arguments == null | arguments.length == 0) {
return new StringValue(""); return new StringValue(BLANK);
} }
String s = arguments[0].head().getStringValue(); String s = arguments[0].head().getStringValue();
return new StringValue(_year(s)); return new StringValue(_normalizeDate(s));
} }
@Override @Override
@ -55,8 +59,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
return SequenceType.SINGLE_STRING; return SequenceType.SINGLE_STRING;
} }
private String _year(String s) { private String _normalizeDate(String s) {
final String date = s != null ? s.trim() : ""; final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
for (String format : normalizeDateFormats) { for (String format : normalizeDateFormats) {
try { try {
@ -66,6 +70,6 @@ public class NormalizeDate extends AbstractExtensionFunction {
} catch (ParseException e) { } catch (ParseException e) {
} }
} }
return ""; return BLANK;
} }
} }

View File

@ -194,10 +194,10 @@ public class SparkDedupTest implements Serializable {
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
.count(); .count();
assertEquals(3432, orgs_simrel); assertEquals(3082, orgs_simrel);
assertEquals(7152, pubs_simrel); assertEquals(7036, pubs_simrel);
assertEquals(344, sw_simrel); assertEquals(344, sw_simrel);
assertEquals(458, ds_simrel); assertEquals(442, ds_simrel);
assertEquals(6750, orp_simrel); assertEquals(6750, orp_simrel);
} }
@ -343,8 +343,8 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count(); .count();
assertEquals(1276, orgs_mergerel); assertEquals(1272, orgs_mergerel);
assertEquals(1442, pubs_mergerel); assertEquals(1438, pubs_mergerel);
assertEquals(288, sw_mergerel); assertEquals(288, sw_mergerel);
assertEquals(472, ds_mergerel); assertEquals(472, ds_mergerel);
assertEquals(718, orp_mergerel); assertEquals(718, orp_mergerel);
@ -390,10 +390,10 @@ public class SparkDedupTest implements Serializable {
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
.count(); .count();
assertEquals(82, orgs_deduprecord); assertEquals(85, orgs_deduprecord);
assertEquals(66, pubs_deduprecord); assertEquals(65, pubs_deduprecord);
assertEquals(51, sw_deduprecord); assertEquals(51, sw_deduprecord);
assertEquals(96, ds_deduprecord); assertEquals(97, ds_deduprecord);
assertEquals(89, orp_deduprecord); assertEquals(89, orp_deduprecord);
} }
@ -473,12 +473,12 @@ public class SparkDedupTest implements Serializable {
.distinct() .distinct()
.count(); .count();
assertEquals(897, publications); assertEquals(896, publications);
assertEquals(835, organizations); assertEquals(838, organizations);
assertEquals(100, projects); assertEquals(100, projects);
assertEquals(100, datasource); assertEquals(100, datasource);
assertEquals(200, softwares); assertEquals(200, softwares);
assertEquals(388, dataset); assertEquals(389, dataset);
assertEquals(517, otherresearchproduct); assertEquals(517, otherresearchproduct);
long deletedOrgs = jsc long deletedOrgs = jsc
@ -533,7 +533,7 @@ public class SparkDedupTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(4866, relations); assertEquals(4858, relations);
// check deletedbyinference // check deletedbyinference
final Dataset<Relation> mergeRels = spark final Dataset<Relation> mergeRels = spark

View File

@ -168,10 +168,10 @@ public class SparkStatsTest implements Serializable {
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
.count(); .count();
assertEquals(121, orgs_blocks); assertEquals(549, orgs_blocks);
assertEquals(110, pubs_blocks); assertEquals(299, pubs_blocks);
assertEquals(21, sw_blocks); assertEquals(122, sw_blocks);
assertEquals(67, ds_blocks); assertEquals(186, ds_blocks);
assertEquals(55, orp_blocks); assertEquals(170, orp_blocks);
} }
} }

View File

@ -6,9 +6,10 @@ import java.util.Objects;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.clearspring.analytics.util.Lists;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -144,22 +145,29 @@ public class CleaningFunctions {
author.setRank(i++); author.setRank(i++);
} }
} }
for(Author a : r.getAuthor()) { for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) { if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList()); a.setPid(Lists.newArrayList());
} else { } else {
a.setPid( a
a.getPid().stream() .setPid(
.filter(p -> Objects.nonNull(p.getQualifier())) a
.filter(p -> StringUtils.isNotBlank(p.getValue())) .getPid()
.map(p -> { .stream()
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); .filter(p -> Objects.nonNull(p.getQualifier()))
return p; .filter(p -> StringUtils.isNotBlank(p.getValue()))
}) .map(p -> {
.collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new)) p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
.values() return p;
.stream() })
.collect(Collectors.toList())); .collect(
Collectors
.toMap(
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
LinkedHashMap::new))
.values()
.stream()
.collect(Collectors.toList()));
} }
} }

View File

@ -38,13 +38,11 @@ import java.io.IOException;
import java.sql.Array; import java.sql.Array;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -197,7 +195,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final Datasource ds = new Datasource(); final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray())); ds
.setOriginalId(
Arrays
.asList(
(String[]) rs.getArray("identities").getArray())
.stream()
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList()));
ds ds
.setCollectedfrom( .setCollectedfrom(
listKeyValues( listKeyValues(
@ -243,7 +248,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setCertificates(field(rs.getString("certificates"), info)); ds.setCertificates(field(rs.getString("certificates"), info));
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
ds ds
.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal .setJournal(
journal(
rs.getString("officialname"),
rs.getString("issnPrinted"),
rs.getString("issnOnline"),
rs.getString("issnLinking"),
info)); // Journal
ds.setDataInfo(info); ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp); ds.setLastupdatetimestamp(lastUpdateTimestamp);
@ -567,21 +578,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return res; return res;
} }
private Journal prepareJournal(final String name, final String sj, final DataInfo info) { private Journal prepareJournal(final ResultSet rs, final DataInfo info) throws SQLException {
if (StringUtils.isNotBlank(sj)) { if (Objects.isNull(rs)) {
final String[] arr = sj.split("@@@"); return null;
if (arr.length == 3) { } else {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null;
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null; return journal(
rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"),
if (issn != null || eissn != null || lissn != null) { rs.getString("issnLinking"), info);
return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info);
}
}
} }
return null;
} }
@Override @Override

View File

@ -153,6 +153,27 @@ public class OafMapperUtils {
return p; return p;
} }
/**
 * Convenience overload that builds a {@code Journal} from its name and ISSN values only.
 *
 * Delegates to the 12-argument {@code journal(...)} overload, passing {@code null} for the seven
 * arguments that sit between {@code issnLinking} and {@code dataInfo}.
 *
 * @param name the journal name
 * @param issnPrinted the print ISSN
 * @param issnOnline the online ISSN
 * @param issnLinking the linking ISSN
 * @param dataInfo the data info forwarded unchanged to the full overload
 * @return whatever the 12-argument overload returns for these values
 */
public static Journal journal(
	final String name,
	final String issnPrinted,
	final String issnOnline,
	final String issnLinking,
	final DataInfo dataInfo) {

	// Only the identifying fields are known here; the remaining seven slots are left unset.
	return journal(
		name, issnPrinted, issnOnline, issnLinking,
		null, null, null, null, null, null, null,
		dataInfo);
}
public static Journal journal( public static Journal journal(
final String name, final String name,
final String issnPrinted, final String issnPrinted,

View File

@ -84,8 +84,10 @@ SELECT
dc.id AS collectedfromid, dc.id AS collectedfromid,
dc.officialname AS collectedfromname, dc.officialname AS collectedfromname,
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype, d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
concat_ws(' @@@ ', d.issn, d.eissn, d.lissn) AS journal d.issn AS issnPrinted,
d.eissn AS issnOnline,
d.lissn AS issnLinking
FROM dsm_datasources d FROM dsm_datasources d

View File

@ -80,9 +80,9 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue()); assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue()); assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName()); assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
assertEquals("2579-5449", ds.getJournal().getIssnPrinted()); assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
assertEquals("2597-6540", ds.getJournal().getIssnOnline()); assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
assertEquals(null, ds.getJournal().getIssnLinking()); assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking());
} }
@Test @Test

View File

@ -228,8 +228,18 @@
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions" "value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
}, },
{ {
"field": "journal", "field": "issnPrinted",
"type": "string", "type": "string",
"value": "2579-5449 @@@ 2597-6540 @@@ " "value": "2579-5449"
},
{
"field": "issnOnline",
"type": "string",
"value": "2579-5448"
},
{
"field": "issnLinking",
"type": "string",
"value": "2579-5447"
} }
] ]

View File

@ -328,7 +328,7 @@
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId> <artifactId>dnet-pace-core</artifactId>
<version>4.0.4</version> <version>4.0.5</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>