forked from D-Net/dnet-hadoop
merge branch with master
This commit is contained in:
commit
ae08b3c0dd
|
@ -5,6 +5,8 @@ import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import net.sf.saxon.expr.XPathContext;
|
import net.sf.saxon.expr.XPathContext;
|
||||||
import net.sf.saxon.om.Sequence;
|
import net.sf.saxon.om.Sequence;
|
||||||
import net.sf.saxon.trans.XPathException;
|
import net.sf.saxon.trans.XPathException;
|
||||||
|
@ -19,6 +21,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
||||||
|
|
||||||
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
|
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
|
||||||
|
|
||||||
|
public static final String BLANK = "";
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return "normalizeDate";
|
return "normalizeDate";
|
||||||
|
@ -27,10 +31,10 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
||||||
@Override
|
@Override
|
||||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||||
if (arguments == null | arguments.length == 0) {
|
if (arguments == null | arguments.length == 0) {
|
||||||
return new StringValue("");
|
return new StringValue(BLANK);
|
||||||
}
|
}
|
||||||
String s = arguments[0].head().getStringValue();
|
String s = arguments[0].head().getStringValue();
|
||||||
return new StringValue(_year(s));
|
return new StringValue(_normalizeDate(s));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -55,8 +59,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
||||||
return SequenceType.SINGLE_STRING;
|
return SequenceType.SINGLE_STRING;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String _year(String s) {
|
private String _normalizeDate(String s) {
|
||||||
final String date = s != null ? s.trim() : "";
|
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
|
||||||
|
|
||||||
for (String format : normalizeDateFormats) {
|
for (String format : normalizeDateFormats) {
|
||||||
try {
|
try {
|
||||||
|
@ -66,6 +70,6 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return "";
|
return BLANK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -194,10 +194,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(3432, orgs_simrel);
|
assertEquals(3082, orgs_simrel);
|
||||||
assertEquals(7152, pubs_simrel);
|
assertEquals(7036, pubs_simrel);
|
||||||
assertEquals(344, sw_simrel);
|
assertEquals(344, sw_simrel);
|
||||||
assertEquals(458, ds_simrel);
|
assertEquals(442, ds_simrel);
|
||||||
assertEquals(6750, orp_simrel);
|
assertEquals(6750, orp_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -343,8 +343,8 @@ public class SparkDedupTest implements Serializable {
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(1276, orgs_mergerel);
|
assertEquals(1272, orgs_mergerel);
|
||||||
assertEquals(1442, pubs_mergerel);
|
assertEquals(1438, pubs_mergerel);
|
||||||
assertEquals(288, sw_mergerel);
|
assertEquals(288, sw_mergerel);
|
||||||
assertEquals(472, ds_mergerel);
|
assertEquals(472, ds_mergerel);
|
||||||
assertEquals(718, orp_mergerel);
|
assertEquals(718, orp_mergerel);
|
||||||
|
@ -390,10 +390,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(82, orgs_deduprecord);
|
assertEquals(85, orgs_deduprecord);
|
||||||
assertEquals(66, pubs_deduprecord);
|
assertEquals(65, pubs_deduprecord);
|
||||||
assertEquals(51, sw_deduprecord);
|
assertEquals(51, sw_deduprecord);
|
||||||
assertEquals(96, ds_deduprecord);
|
assertEquals(97, ds_deduprecord);
|
||||||
assertEquals(89, orp_deduprecord);
|
assertEquals(89, orp_deduprecord);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -473,12 +473,12 @@ public class SparkDedupTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(897, publications);
|
assertEquals(896, publications);
|
||||||
assertEquals(835, organizations);
|
assertEquals(838, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(200, softwares);
|
assertEquals(200, softwares);
|
||||||
assertEquals(388, dataset);
|
assertEquals(389, dataset);
|
||||||
assertEquals(517, otherresearchproduct);
|
assertEquals(517, otherresearchproduct);
|
||||||
|
|
||||||
long deletedOrgs = jsc
|
long deletedOrgs = jsc
|
||||||
|
@ -533,7 +533,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
||||||
|
|
||||||
assertEquals(4866, relations);
|
assertEquals(4858, relations);
|
||||||
|
|
||||||
// check deletedbyinference
|
// check deletedbyinference
|
||||||
final Dataset<Relation> mergeRels = spark
|
final Dataset<Relation> mergeRels = spark
|
||||||
|
|
|
@ -168,10 +168,10 @@ public class SparkStatsTest implements Serializable {
|
||||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(121, orgs_blocks);
|
assertEquals(549, orgs_blocks);
|
||||||
assertEquals(110, pubs_blocks);
|
assertEquals(299, pubs_blocks);
|
||||||
assertEquals(21, sw_blocks);
|
assertEquals(122, sw_blocks);
|
||||||
assertEquals(67, ds_blocks);
|
assertEquals(186, ds_blocks);
|
||||||
assertEquals(55, orp_blocks);
|
assertEquals(170, orp_blocks);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,9 +6,10 @@ import java.util.Objects;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.clearspring.analytics.util.Lists;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.clearspring.analytics.util.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
@ -144,22 +145,29 @@ public class CleaningFunctions {
|
||||||
author.setRank(i++);
|
author.setRank(i++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for(Author a : r.getAuthor()) {
|
for (Author a : r.getAuthor()) {
|
||||||
if (Objects.isNull(a.getPid())) {
|
if (Objects.isNull(a.getPid())) {
|
||||||
a.setPid(Lists.newArrayList());
|
a.setPid(Lists.newArrayList());
|
||||||
} else {
|
} else {
|
||||||
a.setPid(
|
a
|
||||||
a.getPid().stream()
|
.setPid(
|
||||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
a
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
.getPid()
|
||||||
.map(p -> {
|
.stream()
|
||||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||||
return p;
|
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||||
})
|
.map(p -> {
|
||||||
.collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new))
|
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
||||||
.values()
|
return p;
|
||||||
.stream()
|
})
|
||||||
.collect(Collectors.toList()));
|
.collect(
|
||||||
|
Collectors
|
||||||
|
.toMap(
|
||||||
|
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
|
||||||
|
LinkedHashMap::new))
|
||||||
|
.values()
|
||||||
|
.stream()
|
||||||
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -38,13 +38,11 @@ import java.io.IOException;
|
||||||
import java.sql.Array;
|
import java.sql.Array;
|
||||||
import java.sql.ResultSet;
|
import java.sql.ResultSet;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -197,7 +195,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
||||||
final Datasource ds = new Datasource();
|
final Datasource ds = new Datasource();
|
||||||
|
|
||||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
|
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
|
||||||
ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray()));
|
ds
|
||||||
|
.setOriginalId(
|
||||||
|
Arrays
|
||||||
|
.asList(
|
||||||
|
(String[]) rs.getArray("identities").getArray())
|
||||||
|
.stream()
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
ds
|
ds
|
||||||
.setCollectedfrom(
|
.setCollectedfrom(
|
||||||
listKeyValues(
|
listKeyValues(
|
||||||
|
@ -243,7 +248,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
||||||
ds.setCertificates(field(rs.getString("certificates"), info));
|
ds.setCertificates(field(rs.getString("certificates"), info));
|
||||||
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
|
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
|
||||||
ds
|
ds
|
||||||
.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
|
.setJournal(
|
||||||
|
journal(
|
||||||
|
rs.getString("officialname"),
|
||||||
|
rs.getString("issnPrinted"),
|
||||||
|
rs.getString("issnOnline"),
|
||||||
|
rs.getString("issnLinking"),
|
||||||
|
info)); // Journal
|
||||||
ds.setDataInfo(info);
|
ds.setDataInfo(info);
|
||||||
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
|
||||||
|
@ -567,21 +578,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
|
private Journal prepareJournal(final ResultSet rs, final DataInfo info) throws SQLException {
|
||||||
if (StringUtils.isNotBlank(sj)) {
|
if (Objects.isNull(rs)) {
|
||||||
final String[] arr = sj.split("@@@");
|
return null;
|
||||||
if (arr.length == 3) {
|
} else {
|
||||||
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null;
|
|
||||||
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null;
|
|
||||||
|
|
||||||
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null;
|
return journal(
|
||||||
|
rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"),
|
||||||
if (issn != null || eissn != null || lissn != null) {
|
rs.getString("issnLinking"), info);
|
||||||
return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -153,6 +153,27 @@ public class OafMapperUtils {
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Journal journal(
|
||||||
|
final String name,
|
||||||
|
final String issnPrinted,
|
||||||
|
final String issnOnline,
|
||||||
|
final String issnLinking,
|
||||||
|
final DataInfo dataInfo) {
|
||||||
|
return journal(
|
||||||
|
name,
|
||||||
|
issnPrinted,
|
||||||
|
issnOnline,
|
||||||
|
issnLinking,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
dataInfo);
|
||||||
|
}
|
||||||
|
|
||||||
public static Journal journal(
|
public static Journal journal(
|
||||||
final String name,
|
final String name,
|
||||||
final String issnPrinted,
|
final String issnPrinted,
|
||||||
|
|
|
@ -84,8 +84,10 @@ SELECT
|
||||||
dc.id AS collectedfromid,
|
dc.id AS collectedfromid,
|
||||||
dc.officialname AS collectedfromname,
|
dc.officialname AS collectedfromname,
|
||||||
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
|
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
|
||||||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
||||||
concat_ws(' @@@ ', d.issn, d.eissn, d.lissn) AS journal
|
d.issn AS issnPrinted,
|
||||||
|
d.eissn AS issnOnline,
|
||||||
|
d.lissn AS issnLinking
|
||||||
|
|
||||||
FROM dsm_datasources d
|
FROM dsm_datasources d
|
||||||
|
|
||||||
|
|
|
@ -80,9 +80,9 @@ public class MigrateDbEntitiesApplicationTest {
|
||||||
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
|
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
|
||||||
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
|
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
|
||||||
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
|
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
|
||||||
assertEquals("2579-5449", ds.getJournal().getIssnPrinted());
|
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
|
||||||
assertEquals("2597-6540", ds.getJournal().getIssnOnline());
|
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
|
||||||
assertEquals(null, ds.getJournal().getIssnLinking());
|
assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -228,8 +228,18 @@
|
||||||
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
|
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "journal",
|
"field": "issnPrinted",
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"value": "2579-5449 @@@ 2597-6540 @@@ "
|
"value": "2579-5449"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "issnOnline",
|
||||||
|
"type": "string",
|
||||||
|
"value": "2579-5448"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "issnLinking",
|
||||||
|
"type": "string",
|
||||||
|
"value": "2579-5447"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
2
pom.xml
2
pom.xml
|
@ -328,7 +328,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
<artifactId>dnet-pace-core</artifactId>
|
<artifactId>dnet-pace-core</artifactId>
|
||||||
<version>4.0.4</version>
|
<version>4.0.5</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
|
Loading…
Reference in New Issue