Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop

This commit is contained in:
Michele Artini 2020-06-16 15:53:39 +02:00
commit 113c9b1de0
8 changed files with 1415 additions and 1418 deletions

View File

@ -126,9 +126,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
url.add(((Node) o).getText().trim()); url.add(((Node) o).getText().trim());
} }
for (final Object o : doc
.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='landingPage']")) {
url.add(((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) { for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) {
url.add(((Node) o).getText().trim()); url.add(((Node) o).getText().trim());
} }
for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='landingPage']")) {
url.add(((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) { for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) {
url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim()); url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim());
} }
@ -367,11 +374,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
res res
.addAll( .addAll(
prepareListStructPropsWithValidQualifier( prepareListStructPropsWithValidQualifier(
doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", DNET_PID_TYPES, info)); doc, "//datacite:identifier[@identifierType != 'URL' and @identifierType != 'landingPage']",
"@identifierType", DNET_PID_TYPES, info));
res res
.addAll( .addAll(
prepareListStructPropsWithValidQualifier( prepareListStructPropsWithValidQualifier(
doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", doc,
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
"@alternateIdentifierType", DNET_PID_TYPES, info)); "@alternateIdentifierType", DNET_PID_TYPES, info));
return res; return res;
} }

View File

@ -61,7 +61,7 @@ public class Vocabulary implements Serializable {
} }
public VocabularyTerm getTermBySynonym(final String syn) { public VocabularyTerm getTermBySynonym(final String syn) {
return getTerm(synonyms.get(syn)); return getTerm(synonyms.get(syn.toLowerCase()));
} }
public Qualifier getTermAsQualifier(final String termId) { public Qualifier getTermAsQualifier(final String termId) {

View File

@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.function.Supplier;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -46,7 +45,7 @@ public class VocabularyGroup implements Serializable {
} }
vocs.addTerm(vocId, termId, termName); vocs.addTerm(vocId, termId, termName);
vocs.addSynonyms(vocId, termId, termId); // vocs.addSynonyms(vocId, termId, termId);
} }
} }
@ -58,7 +57,7 @@ public class VocabularyGroup implements Serializable {
final String syn = arr[2].trim(); final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn); vocs.addSynonyms(vocId, termId, syn);
vocs.addSynonyms(vocId, termId, termId); // vocs.addSynonyms(vocId, termId, termId);
} }
} }
@ -135,7 +134,7 @@ public class VocabularyGroup implements Serializable {
Optional Optional
.ofNullable(vocs.get(id)) .ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId)) .orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))
.addSynonym(syn, termId); .addSynonym(syn.toLowerCase(), termId);
} }
} }

View File

@ -1,10 +1,10 @@
DROP VIEW IF EXISTS ${hiveDbName}.result; DROP VIEW IF EXISTS ${hiveDbName}.result;
CREATE VIEW IF NOT EXISTS result as CREATE VIEW IF NOT EXISTS result as
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.publication p select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p
union all union all
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.dataset d select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d
union all union all
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.software s select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s
union all union all
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.otherresearchproduct o; select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o;

View File

@ -60,8 +60,8 @@ public class CleaningFunctionTest {
assertNotNull(p_out); assertNotNull(p_out);
assertEquals("eng", p_out.getLanguage().getClassid()); assertEquals("und", p_out.getLanguage().getClassid());
assertEquals("English", p_out.getLanguage().getClassname()); assertEquals("Undetermined", p_out.getLanguage().getClassname());
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());

View File

@ -281,8 +281,8 @@
"value": "VIRTA" "value": "VIRTA"
}, },
"instancetype": { "instancetype": {
"classid": "Comentario", "classid": "Comment/debate",
"classname": "Comentario", "classname": "Comment/debate",
"schemeid": "dnet:publication_resource", "schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource" "schemename": "dnet:publication_resource"
}, },
@ -317,8 +317,8 @@
"vol": "" "vol": ""
}, },
"language": { "language": {
"classid": "en", "classid": "UNKNOWN",
"classname": "en", "classname": "UNKNOWN",
"schemeid": "dnet:languages", "schemeid": "dnet:languages",
"schemename": "dnet:languages" "schemename": "dnet:languages"
}, },