From 2039bb9f5fdbc949d00ffe7fc6d78d8890f0bc15 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 14 Jun 2021 09:40:50 +0200 Subject: [PATCH] orcid / orcid_pending cleaning backported from master branch --- .../oaf/utils/GraphCleaningFunctions.java | 81 ++++++--- .../schema/oaf/utils/OafMapperUtilsTest.java | 5 + .../raw/AbstractMdRecordToOafMapper.java | 4 +- .../raw/MigrateHdfsMdstoresApplication.java | 5 - .../oozie_app/workflow-bak.xml | 157 ------------------ .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 4 +- 6 files changed, 67 insertions(+), 189 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow-bak.xml diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 999272113..7088e56e1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -26,8 +26,9 @@ import eu.dnetlib.dhp.schema.oaf.*; public class GraphCleaningFunctions extends CleaningFunctions { + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; + public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; - public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; @@ -281,7 +282,27 @@ public class GraphCleaningFunctions extends CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { - final List authors = Lists.newArrayList(); + r + .setAuthor( + r + .getAuthor() + .stream() + .filter(a -> Objects.nonNull(a)) + .filter(a -> StringUtils.isNotBlank(a.getFullname())) + .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) + .collect(Collectors.toList())); + + boolean nullRank = r + .getAuthor() + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : r.getAuthor()) { + author.setRank(i++); + } + } + for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); @@ -295,40 +316,52 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(p -> Objects.nonNull(p.getQualifier())) .filter(p -> StringUtils.isNotBlank(p.getValue())) .map(p -> { - p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); + // hack to distinguish orcid from orcid_pending + String pidProvenance = Optional + .ofNullable(p.getDataInfo()) + .map( + d -> Optional + .ofNullable(d.getProvenanceaction()) + .map(Qualifier::getClassid) + .orElse("")) + .orElse(""); + if (p + .getQualifier() + .getClassid() + .toLowerCase() + .contains(ModelConstants.ORCID)) { + if (pidProvenance + .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { + p.getQualifier().setClassid(ModelConstants.ORCID); + } else { + p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); + } + final String orcid = p + .getValue() + .trim() + .toLowerCase() + .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); + if (orcid.length() == ORCID_LEN) { + p.setValue(orcid); + } else { + p.setValue(""); + } + } return p; }) .filter(p -> StringUtils.isNotBlank(p.getValue())) .collect( Collectors .toMap( - StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, + p -> p.getQualifier().getClassid() + p.getValue(), + Function.identity(), + (p1, p2) -> p1, LinkedHashMap::new)) .values() .stream() .collect(Collectors.toList())); } - if (StringUtils.isBlank(a.getFullname())) { - if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) { - a.setFullname(a.getSurname() + ", " + a.getName()); - } - } - if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) { - authors.add(a); - } } - - boolean nullRank = authors - .stream() - .anyMatch(a -> Objects.isNull(a.getRank())); - if (nullRank) { - int i = 1; - for (Author author : authors) { - author.setRank(i++); - } - } - r.setAuthor(authors); - } if (value instanceof Publication) { diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index b2cc669fe..eefa1e9a3 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -131,6 +131,11 @@ public class OafMapperUtilsTest { } + @Test + public void testDate() { + System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998")); + } + @Test public void testMergePubs() throws IOException { Publication p1 = read("publication_1.json", Publication.class); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 1beb616ba..1e80dfd46 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -76,7 +76,9 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; protected static final Qualifier ORCID_PID_TYPE = qualifier( - "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + ModelConstants.ORCID_PENDING, + ModelConstants.ORCID_CLASSNAME, + DNET_PID_TYPES, DNET_PID_TYPES); protected static final Qualifier MAG_PID_TYPE = qualifier( "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java index 77fd28d77..f4e783edc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java @@ -102,11 +102,6 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication .mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml))) // .coalesce(1) .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - - /* - * .foreach(xml -> { try { writer.append(new Text(UUID.randomUUID() + ":" + type), new Text(xml)); } catch - * (final Exception e) { throw new RuntimeException(e); } }); - */ } private static String enrichRecord(final Row r) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow-bak.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow-bak.xml deleted file mode 100644 index bfe2dff0b..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow-bak.xml +++ /dev/null @@ -1,157 +0,0 @@ - - - - - graphOutputPath - the target path to store raw graph - - - contentPath - path location to store (or reuse) content from the aggregator - - - mdstoreManagerUrl - the address of the Mdstore Manager - - - isLookupUrl - the address of the lookUp service - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - yarn - cluster - ImportODF_hdfs - eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --hdfsPath${contentPath}/odf_records_hdfs - --mdstoreManagerUrl${mdstoreManagerUrl} - --mdFormatODF - --mdLayoutstore - --mdInterpretationcleaned - - - - - - - - yarn - cluster - GenerateEntities - eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePaths${contentPath}/odf_records_hdfs - --targetPath${workingDir}/entities - --isLookupUrl${isLookupUrl} - --shouldHashId${shouldHashId} - - - - - - - - yarn - cluster - GenerateGraph - eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --sourcePath${workingDir}/entities - --graphRawPath${workingDir}/graph_raw - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 08a596db5..5b229a625 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -195,8 +195,8 @@ public class MappersTest { .findFirst() .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); - assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid()); + assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Votsi,Nefta", author.get().getFullname());