forked from D-Net/dnet-hadoop
merge branch with master
This commit is contained in:
commit
ae08b3c0dd
|
@ -5,6 +5,8 @@ import java.text.ParseException;
|
|||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
|
@ -19,6 +21,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
|
||||
// Output pattern for normalized dates. Uses "HH" (hour of day, 0-23): the previous "hh"
// (clock hour, 1-12) has no AM/PM marker in this pattern, so it rendered midnight as "12"
// and made afternoon hours indistinguishable from morning ones.
private static final String normalizeOutFormat = "yyyy-MM-dd'T'HH:mm:ss'Z'";
|
||||
|
||||
public static final String BLANK = "";
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "normalizeDate";
|
||||
|
@ -27,10 +31,10 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
return new StringValue(BLANK);
|
||||
}
|
||||
String s = arguments[0].head().getStringValue();
|
||||
return new StringValue(_year(s));
|
||||
return new StringValue(_normalizeDate(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -55,8 +59,8 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
final String date = s != null ? s.trim() : "";
|
||||
private String _normalizeDate(String s) {
|
||||
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
|
@ -66,6 +70,6 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
} catch (ParseException e) {
|
||||
}
|
||||
}
|
||||
return "";
|
||||
return BLANK;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -194,10 +194,10 @@ public class SparkDedupTest implements Serializable {
|
|||
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||
.count();
|
||||
|
||||
assertEquals(3432, orgs_simrel);
|
||||
assertEquals(7152, pubs_simrel);
|
||||
assertEquals(3082, orgs_simrel);
|
||||
assertEquals(7036, pubs_simrel);
|
||||
assertEquals(344, sw_simrel);
|
||||
assertEquals(458, ds_simrel);
|
||||
assertEquals(442, ds_simrel);
|
||||
assertEquals(6750, orp_simrel);
|
||||
}
|
||||
|
||||
|
@ -343,8 +343,8 @@ public class SparkDedupTest implements Serializable {
|
|||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||
.count();
|
||||
|
||||
assertEquals(1276, orgs_mergerel);
|
||||
assertEquals(1442, pubs_mergerel);
|
||||
assertEquals(1272, orgs_mergerel);
|
||||
assertEquals(1438, pubs_mergerel);
|
||||
assertEquals(288, sw_mergerel);
|
||||
assertEquals(472, ds_mergerel);
|
||||
assertEquals(718, orp_mergerel);
|
||||
|
@ -390,10 +390,10 @@ public class SparkDedupTest implements Serializable {
|
|||
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||
.count();
|
||||
|
||||
assertEquals(82, orgs_deduprecord);
|
||||
assertEquals(66, pubs_deduprecord);
|
||||
assertEquals(85, orgs_deduprecord);
|
||||
assertEquals(65, pubs_deduprecord);
|
||||
assertEquals(51, sw_deduprecord);
|
||||
assertEquals(96, ds_deduprecord);
|
||||
assertEquals(97, ds_deduprecord);
|
||||
assertEquals(89, orp_deduprecord);
|
||||
}
|
||||
|
||||
|
@ -473,12 +473,12 @@ public class SparkDedupTest implements Serializable {
|
|||
.distinct()
|
||||
.count();
|
||||
|
||||
assertEquals(897, publications);
|
||||
assertEquals(835, organizations);
|
||||
assertEquals(896, publications);
|
||||
assertEquals(838, organizations);
|
||||
assertEquals(100, projects);
|
||||
assertEquals(100, datasource);
|
||||
assertEquals(200, softwares);
|
||||
assertEquals(388, dataset);
|
||||
assertEquals(389, dataset);
|
||||
assertEquals(517, otherresearchproduct);
|
||||
|
||||
long deletedOrgs = jsc
|
||||
|
@ -533,7 +533,7 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
||||
|
||||
assertEquals(4866, relations);
|
||||
assertEquals(4858, relations);
|
||||
|
||||
// check deletedbyinference
|
||||
final Dataset<Relation> mergeRels = spark
|
||||
|
|
|
@ -168,10 +168,10 @@ public class SparkStatsTest implements Serializable {
|
|||
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
||||
.count();
|
||||
|
||||
assertEquals(121, orgs_blocks);
|
||||
assertEquals(110, pubs_blocks);
|
||||
assertEquals(21, sw_blocks);
|
||||
assertEquals(67, ds_blocks);
|
||||
assertEquals(55, orp_blocks);
|
||||
assertEquals(549, orgs_blocks);
|
||||
assertEquals(299, pubs_blocks);
|
||||
assertEquals(122, sw_blocks);
|
||||
assertEquals(186, ds_blocks);
|
||||
assertEquals(170, orp_blocks);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,9 +6,10 @@ import java.util.Objects;
|
|||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.clearspring.analytics.util.Lists;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.clearspring.analytics.util.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
@ -144,22 +145,29 @@ public class CleaningFunctions {
|
|||
author.setRank(i++);
|
||||
}
|
||||
}
|
||||
for(Author a : r.getAuthor()) {
|
||||
for (Author a : r.getAuthor()) {
|
||||
if (Objects.isNull(a.getPid())) {
|
||||
a.setPid(Lists.newArrayList());
|
||||
} else {
|
||||
a.setPid(
|
||||
a.getPid().stream()
|
||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
||||
return p;
|
||||
})
|
||||
.collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new))
|
||||
.values()
|
||||
.stream()
|
||||
.collect(Collectors.toList()));
|
||||
a
|
||||
.setPid(
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
||||
return p;
|
||||
})
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
|
||||
LinkedHashMap::new))
|
||||
.values()
|
||||
.stream()
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,13 +38,11 @@ import java.io.IOException;
|
|||
import java.sql.Array;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -197,7 +195,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
final Datasource ds = new Datasource();
|
||||
|
||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
|
||||
ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray()));
|
||||
ds
|
||||
.setOriginalId(
|
||||
Arrays
|
||||
.asList(
|
||||
(String[]) rs.getArray("identities").getArray())
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList()));
|
||||
ds
|
||||
.setCollectedfrom(
|
||||
listKeyValues(
|
||||
|
@ -243,7 +248,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
ds.setCertificates(field(rs.getString("certificates"), info));
|
||||
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
|
||||
ds
|
||||
.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
|
||||
.setJournal(
|
||||
journal(
|
||||
rs.getString("officialname"),
|
||||
rs.getString("issnPrinted"),
|
||||
rs.getString("issnOnline"),
|
||||
rs.getString("issnLinking"),
|
||||
info)); // Journal
|
||||
ds.setDataInfo(info);
|
||||
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
|
@ -567,21 +578,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
return res;
|
||||
}
|
||||
|
||||
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
|
||||
if (StringUtils.isNotBlank(sj)) {
|
||||
final String[] arr = sj.split("@@@");
|
||||
if (arr.length == 3) {
|
||||
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null;
|
||||
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null;
|
||||
private Journal prepareJournal(final ResultSet rs, final DataInfo info) throws SQLException {
|
||||
if (Objects.isNull(rs)) {
|
||||
return null;
|
||||
} else {
|
||||
|
||||
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null;
|
||||
|
||||
if (issn != null || eissn != null || lissn != null) {
|
||||
return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info);
|
||||
}
|
||||
}
|
||||
return journal(
|
||||
rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"),
|
||||
rs.getString("issnLinking"), info);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -153,6 +153,27 @@ public class OafMapperUtils {
|
|||
return p;
|
||||
}
|
||||
|
||||
public static Journal journal(
|
||||
final String name,
|
||||
final String issnPrinted,
|
||||
final String issnOnline,
|
||||
final String issnLinking,
|
||||
final DataInfo dataInfo) {
|
||||
return journal(
|
||||
name,
|
||||
issnPrinted,
|
||||
issnOnline,
|
||||
issnLinking,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
dataInfo);
|
||||
}
|
||||
|
||||
public static Journal journal(
|
||||
final String name,
|
||||
final String issnPrinted,
|
||||
|
|
|
@ -84,8 +84,10 @@ SELECT
|
|||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
|
||||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
||||
concat_ws(' @@@ ', d.issn, d.eissn, d.lissn) AS journal
|
||||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
||||
d.issn AS issnPrinted,
|
||||
d.eissn AS issnOnline,
|
||||
d.lissn AS issnLinking
|
||||
|
||||
FROM dsm_datasources d
|
||||
|
||||
|
|
|
@ -80,9 +80,9 @@ public class MigrateDbEntitiesApplicationTest {
|
|||
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
|
||||
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
|
||||
assertEquals("2579-5449", ds.getJournal().getIssnPrinted());
|
||||
assertEquals("2597-6540", ds.getJournal().getIssnOnline());
|
||||
assertEquals(null, ds.getJournal().getIssnLinking());
|
||||
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
|
||||
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
|
||||
assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -228,8 +228,18 @@
|
|||
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
|
||||
},
|
||||
{
|
||||
"field": "journal",
|
||||
"field": "issnPrinted",
|
||||
"type": "string",
|
||||
"value": "2579-5449 @@@ 2597-6540 @@@ "
|
||||
"value": "2579-5449"
|
||||
},
|
||||
{
|
||||
"field": "issnOnline",
|
||||
"type": "string",
|
||||
"value": "2579-5448"
|
||||
},
|
||||
{
|
||||
"field": "issnLinking",
|
||||
"type": "string",
|
||||
"value": "2579-5447"
|
||||
}
|
||||
]
|
||||
|
|
Loading…
Reference in New Issue