diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java index 9fb60e145..1b5f3c40d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -5,6 +5,8 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; +import org.apache.commons.lang3.StringUtils; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; @@ -19,6 +21,8 @@ public class NormalizeDate extends AbstractExtensionFunction { private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'"; + public static final String BLANK = ""; + @Override public String getName() { return "normalizeDate"; @@ -27,10 +31,10 @@ public class NormalizeDate extends AbstractExtensionFunction { @Override public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { - return new StringValue(""); + if (arguments == null || arguments.length == 0) { /* || short-circuits, so a null array never reaches .length */ + return new StringValue(BLANK); } String s = arguments[0].head().getStringValue(); - return new StringValue(_year(s)); + return new StringValue(_normalizeDate(s)); } @Override @@ -55,8 +59,8 @@ public class NormalizeDate extends AbstractExtensionFunction { return SequenceType.SINGLE_STRING; } - private String _year(String s) { - final String date = s != null ? s.trim() : ""; + private String _normalizeDate(String s) { + final String date = StringUtils.isNotBlank(s) ? 
s.trim() : BLANK; for (String format : normalizeDateFormats) { try { @@ -66,6 +70,6 @@ public class NormalizeDate extends AbstractExtensionFunction { } catch (ParseException e) { } } - return ""; + return BLANK; } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index fb5ebc099..2c1607165 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -194,10 +194,10 @@ public class SparkDedupTest implements Serializable { .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") .count(); - assertEquals(3432, orgs_simrel); - assertEquals(7152, pubs_simrel); + assertEquals(3082, orgs_simrel); + assertEquals(7036, pubs_simrel); assertEquals(344, sw_simrel); - assertEquals(458, ds_simrel); + assertEquals(442, ds_simrel); assertEquals(6750, orp_simrel); } @@ -343,8 +343,8 @@ public class SparkDedupTest implements Serializable { .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") .count(); - assertEquals(1276, orgs_mergerel); - assertEquals(1442, pubs_mergerel); + assertEquals(1272, orgs_mergerel); + assertEquals(1438, pubs_mergerel); assertEquals(288, sw_mergerel); assertEquals(472, ds_mergerel); assertEquals(718, orp_mergerel); @@ -390,10 +390,10 @@ public class SparkDedupTest implements Serializable { testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") .count(); - assertEquals(82, orgs_deduprecord); - assertEquals(66, pubs_deduprecord); + assertEquals(85, orgs_deduprecord); + assertEquals(65, pubs_deduprecord); assertEquals(51, sw_deduprecord); - assertEquals(96, ds_deduprecord); + assertEquals(97, ds_deduprecord); assertEquals(89, orp_deduprecord); } @@ -473,12 +473,12 @@ public class 
SparkDedupTest implements Serializable { .distinct() .count(); - assertEquals(897, publications); - assertEquals(835, organizations); + assertEquals(896, publications); + assertEquals(838, organizations); assertEquals(100, projects); assertEquals(100, datasource); assertEquals(200, softwares); - assertEquals(388, dataset); + assertEquals(389, dataset); assertEquals(517, otherresearchproduct); long deletedOrgs = jsc @@ -533,7 +533,7 @@ public class SparkDedupTest implements Serializable { long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); - assertEquals(4866, relations); + assertEquals(4858, relations); // check deletedbyinference final Dataset mergeRels = spark diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java index 7e76c284b..31de8d951 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java @@ -168,10 +168,10 @@ public class SparkStatsTest implements Serializable { .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") .count(); - assertEquals(121, orgs_blocks); - assertEquals(110, pubs_blocks); - assertEquals(21, sw_blocks); - assertEquals(67, ds_blocks); - assertEquals(55, orp_blocks); + assertEquals(549, orgs_blocks); + assertEquals(299, pubs_blocks); + assertEquals(122, sw_blocks); + assertEquals(186, ds_blocks); + assertEquals(170, orp_blocks); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index f615d69f2..84f88003b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -6,9 +6,10 @@ import java.util.Objects; import java.util.function.Function; import java.util.stream.Collectors; -import com.clearspring.analytics.util.Lists; import org.apache.commons.lang3.StringUtils; +import com.clearspring.analytics.util.Lists; + import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -144,22 +145,29 @@ public class CleaningFunctions { author.setRank(i++); } } - for(Author a : r.getAuthor()) { + for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); } else { - a.setPid( - a.getPid().stream() - .filter(p -> Objects.nonNull(p.getQualifier())) - .filter(p -> StringUtils.isNotBlank(p.getValue())) - .map(p -> { - p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); - return p; - }) - .collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new)) - .values() - .stream() - .collect(Collectors.toList())); + a + .setPid( + a + .getPid() + .stream() + .filter(p -> Objects.nonNull(p.getQualifier())) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .map(p -> { + p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); + return p; + }) + .collect( + Collectors + .toMap( + StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, + LinkedHashMap::new)) + .values() + .stream() + .collect(Collectors.toList())); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 1e7b56ee9..adf7b92be 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -38,13 +38,15 @@ import java.io.IOException; import java.sql.Array; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.List; +import java.util.Objects; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Predicate; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -197,7 +199,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final Datasource ds = new Datasource(); ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); - ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray())); + ds + .setOriginalId( + Arrays + .asList( + (String[]) rs.getArray("identities").getArray()) + .stream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList())); ds .setCollectedfrom( listKeyValues( @@ -243,7 +252,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i ds.setCertificates(field(rs.getString("certificates"), info)); ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array ds - .setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal + .setJournal( + journal( + rs.getString("officialname"), + rs.getString("issnPrinted"), + rs.getString("issnOnline"), + rs.getString("issnLinking"), + info)); // Journal ds.setDataInfo(info); ds.setLastupdatetimestamp(lastUpdateTimestamp); @@ -567,21 +582,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i return res; } - private Journal prepareJournal(final String name, final String sj, final DataInfo info) { - if (StringUtils.isNotBlank(sj)) { - final String[] arr = sj.split("@@@"); - if (arr.length == 3) { - 
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null; - final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null; + private Journal prepareJournal(final ResultSet rs, final DataInfo info) throws SQLException { + if (Objects.isNull(rs)) { + return null; + } else { - final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null; - - if (issn != null || eissn != null || lissn != null) { - return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info); - } - } + return journal( + rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"), + rs.getString("issnLinking"), info); } - return null; } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index 63db13b8f..84b29e3d4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -153,6 +153,27 @@ public class OafMapperUtils { return p; } + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final DataInfo dataInfo) { + return journal( + name, + issnPrinted, + issnOnline, + issnLinking, + null, + null, + null, + null, + null, + null, + null, + dataInfo); + } + public static Journal journal( final String name, final String issnPrinted, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql index 7ca672835..f0a4161ab 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryDatasources.sql @@ -84,8 +84,10 @@ SELECT dc.id AS collectedfromid, dc.officialname AS collectedfromname, d.typology||'@@@dnet:datasource_typologies' AS datasourcetype, - 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, - concat_ws(' @@@ ', d.issn, d.eissn, d.lissn) AS journal + 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, + d.issn AS issnPrinted, + d.eissn AS issnOnline, + d.lissn AS issnLinking FROM dsm_datasources d diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 011cc18e6..f663d6095 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -80,9 +80,9 @@ public class MigrateDbEntitiesApplicationTest { assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue()); assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue()); assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName()); - assertEquals("2579-5449", ds.getJournal().getIssnPrinted()); - assertEquals("2597-6540", ds.getJournal().getIssnOnline()); - assertEquals(null, ds.getJournal().getIssnLinking()); + assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted()); + assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline()); + assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking()); } @Test diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json index 0f1da7095..8f8aed3a0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json @@ -228,8 +228,18 @@ "value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions" }, { - "field": "journal", + "field": "issnPrinted", "type": "string", - "value": "2579-5449 @@@ 2597-6540 @@@ " + "value": "2579-5449" + }, + { + "field": "issnOnline", + "type": "string", + "value": "2579-5448" + }, + { + "field": "issnLinking", + "type": "string", + "value": "2579-5447" } ] diff --git a/pom.xml b/pom.xml index c955f8832..03c69108d 100644 --- a/pom.xml +++ b/pom.xml @@ -328,7 +328,7 @@ eu.dnetlib dnet-pace-core - 4.0.4 + 4.0.5 eu.dnetlib