Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop

2020-06-16 15:53:39 +02:00 · 2020-06-16 15:53:39 +02:00 · 113c9b1de0
parent 76ea7607f7 5441f01586
commit 113c9b1de0
8 changed files with 1415 additions and 1418 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@@ -126,9 +126,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 		for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
 			url.add(((Node) o).getText().trim());
 		}
+		for (final Object o : doc
+			.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='landingPage']")) {
+			url.add(((Node) o).getText().trim());
+		}
 		for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) {
 			url.add(((Node) o).getText().trim());
 		}
+		for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='landingPage']")) {
+			url.add(((Node) o).getText().trim());
+		}
 		for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) {
 			url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim());
 		}
@@ -367,11 +374,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 		res
 			.addAll(
 				prepareListStructPropsWithValidQualifier(
-					doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", DNET_PID_TYPES, info));
+					doc, "//datacite:identifier[@identifierType != 'URL' and @identifierType != 'landingPage']",
+					"@identifierType", DNET_PID_TYPES, info));
 		res
 			.addAll(
 				prepareListStructPropsWithValidQualifier(
-					doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']",
+					doc,
+					"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
 					"@alternateIdentifierType", DNET_PID_TYPES, info));
 		return res;
 	}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java
@@ -61,7 +61,7 @@ public class Vocabulary implements Serializable {
 	}

 	public VocabularyTerm getTermBySynonym(final String syn) {
-		return getTerm(synonyms.get(syn));
+		return getTerm(synonyms.get(syn.toLowerCase()));
 	}

 	public Qualifier getTermAsQualifier(final String termId) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java
@@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.graph.raw.common;

 import java.io.Serializable;
 import java.util.*;
-import java.util.function.Supplier;
 import java.util.stream.Collectors;

 import org.apache.commons.lang3.StringUtils;
@@ -46,7 +45,7 @@ public class VocabularyGroup implements Serializable {
 				}

 				vocs.addTerm(vocId, termId, termName);
-				vocs.addSynonyms(vocId, termId, termId);
+				// vocs.addSynonyms(vocId, termId, termId);
 			}
 		}

@@ -58,7 +57,7 @@ public class VocabularyGroup implements Serializable {
 				final String syn = arr[2].trim();

 				vocs.addSynonyms(vocId, termId, syn);
-				vocs.addSynonyms(vocId, termId, termId);
+				// vocs.addSynonyms(vocId, termId, termId);
 			}
 		}

@@ -135,7 +134,7 @@ public class VocabularyGroup implements Serializable {
 		Optional
 			.ofNullable(vocs.get(id))
 			.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))
-			.addSynonym(syn, termId);
+			.addSynonym(syn.toLowerCase(), termId);
 	}

 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/postprocessing.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/postprocessing.sql
@@ -1,10 +1,10 @@
 DROP VIEW IF EXISTS ${hiveDbName}.result;

 CREATE VIEW IF NOT EXISTS result as
-    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.publication p
+    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p
    union all
-    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.dataset d
+    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d
    union all
-    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.software s
+    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s
    union all
-    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hiveDbName}.otherresearchproduct o;
+    select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o;
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
@@ -60,8 +60,8 @@ public class CleaningFunctionTest {

 		assertNotNull(p_out);

-		assertEquals("eng", p_out.getLanguage().getClassid());
-		assertEquals("English", p_out.getLanguage().getClassname());
+		assertEquals("und", p_out.getLanguage().getClassid());
+		assertEquals("Undetermined", p_out.getLanguage().getClassname());

 		assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
 		assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json
@@ -281,8 +281,8 @@
        "value": "VIRTA"
      },
      "instancetype": {
-        "classid": "Comentario",
-        "classname": "Comentario",
+        "classid": "Comment/debate",
+        "classname": "Comment/debate",
        "schemeid": "dnet:publication_resource",
        "schemename": "dnet:publication_resource"
      },
@@ -317,8 +317,8 @@
    "vol": ""
  },
  "language": {
-    "classid": "en",
-    "classname": "en",
+    "classid": "UNKNOWN",
+    "classname": "UNKNOWN",
    "schemeid": "dnet:languages",
    "schemename": "dnet:languages"
  },
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt