
orcid / orcid_pending cleaning backported from master branch

Claudio Atzori 2021-06-14 09:40:50 +02:00
parent dd19c4ac5a
commit 2039bb9f5f
6 changed files with 67 additions and 189 deletions


@@ -26,8 +26,9 @@ import eu.dnetlib.dhp.schema.oaf.*;
 public class GraphCleaningFunctions extends CleaningFunctions {
+	public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
+	public static final int ORCID_LEN = 19;
 	public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
-	public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
 	public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
 	public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
 	public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
@@ -281,7 +282,27 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 				}
 			}
 			if (Objects.nonNull(r.getAuthor())) {
-				final List<Author> authors = Lists.newArrayList();
+				r
+					.setAuthor(
+						r
+							.getAuthor()
+							.stream()
+							.filter(a -> Objects.nonNull(a))
+							.filter(a -> StringUtils.isNotBlank(a.getFullname()))
+							.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
+							.collect(Collectors.toList()));
+				boolean nullRank = r
+					.getAuthor()
+					.stream()
+					.anyMatch(a -> Objects.isNull(a.getRank()));
+				if (nullRank) {
+					int i = 1;
+					for (Author author : r.getAuthor()) {
+						author.setRank(i++);
+					}
+				}
 				for (Author a : r.getAuthor()) {
 					if (Objects.isNull(a.getPid())) {
 						a.setPid(Lists.newArrayList());
@@ -295,41 +316,53 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 								.filter(p -> Objects.nonNull(p.getQualifier()))
 								.filter(p -> StringUtils.isNotBlank(p.getValue()))
 								.map(p -> {
-									p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
+									// hack to distinguish orcid from orcid_pending
+									String pidProvenance = Optional
+										.ofNullable(p.getDataInfo())
+										.map(
+											d -> Optional
+												.ofNullable(d.getProvenanceaction())
+												.map(Qualifier::getClassid)
+												.orElse(""))
+										.orElse("");
+									if (p
+										.getQualifier()
+										.getClassid()
+										.toLowerCase()
+										.contains(ModelConstants.ORCID)) {
+										if (pidProvenance
+											.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
+											p.getQualifier().setClassid(ModelConstants.ORCID);
+										} else {
+											p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
+										}
+										final String orcid = p
+											.getValue()
+											.trim()
+											.toLowerCase()
+											.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
+										if (orcid.length() == ORCID_LEN) {
+											p.setValue(orcid);
+										} else {
+											p.setValue("");
+										}
+									}
 									return p;
 								})
 								.filter(p -> StringUtils.isNotBlank(p.getValue()))
 								.collect(
 									Collectors
 										.toMap(
-											StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
+											p -> p.getQualifier().getClassid() + p.getValue(),
+											Function.identity(),
+											(p1, p2) -> p1,
 											LinkedHashMap::new))
 								.values()
 								.stream()
 								.collect(Collectors.toList()));
 					}
-					if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) {
-						authors.add(a);
+					if (StringUtils.isBlank(a.getFullname())) {
+						if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) {
+							a.setFullname(a.getSurname() + ", " + a.getName());
+						}
 					}
 				}
-				boolean nullRank = authors
-					.stream()
-					.anyMatch(a -> Objects.isNull(a.getRank()));
-				if (nullRank) {
-					int i = 1;
-					for (Author author : authors) {
-						author.setRank(i++);
-					}
-				}
-				r.setAuthor(authors);
 			}
 			if (value instanceof Publication) {
 			} else if (value instanceof Dataset) {
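
For reference, a minimal self-contained sketch of the value normalization applied above: the pid value is trimmed, lowercased, its four digit groups are extracted with ORCID_CLEANING_REGEX and re-joined with plain hyphens, and the result is kept only when it has the canonical 19-character length. The class and method names below are illustrative only; the actual logic lives in GraphCleaningFunctions.

import java.util.Optional;

// Illustrative sketch, not part of this commit: mirrors the ORCID value cleaning above.
public class OrcidValueCleaningSketch {

	static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
	static final int ORCID_LEN = 19;

	// Returns the normalized ORCID iD, or empty when the raw value cannot be normalized.
	static Optional<String> cleanOrcidValue(final String rawValue) {
		final String orcid = rawValue
			.trim()
			.toLowerCase()
			.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
		return orcid.length() == ORCID_LEN ? Optional.of(orcid) : Optional.empty();
	}

	public static void main(String[] args) {
		System.out.println(cleanOrcidValue("https://orcid.org/0000-0001-6651-1178")); // Optional[0000-0001-6651-1178]
		System.out.println(cleanOrcidValue("n/a")); // Optional.empty
	}
}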


@@ -131,6 +131,11 @@ public class OafMapperUtilsTest {
 	}
+	@Test
+	public void testDate() {
+		System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998"));
+	}
 	@Test
 	public void testMergePubs() throws IOException {
 		Publication p1 = read("publication_1.json", Publication.class);


@@ -76,7 +76,9 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
 	protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
 	protected static final Qualifier ORCID_PID_TYPE = qualifier(
-		"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
+		ModelConstants.ORCID_PENDING,
+		ModelConstants.ORCID_CLASSNAME,
+		DNET_PID_TYPES, DNET_PID_TYPES);
 	protected static final Qualifier MAG_PID_TYPE = qualifier(
 		"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);


@@ -102,11 +102,6 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
 			.mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml)))
 			// .coalesce(1)
 			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
-		/*
-		 * .foreach(xml -> { try { writer.append(new Text(UUID.randomUUID() + ":" + type), new Text(xml)); } catch
-		 * (final Exception e) { throw new RuntimeException(e); } });
-		 */
 	}
 	private static String enrichRecord(final Row r) {


@@ -1,157 +0,0 @@
<workflow-app name="Test Import of Hdfs Stores" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphOutputPath</name>
<description>the target path to store raw graph</description>
</property>
<property>
<name>contentPath</name>
<description>path location to store (or reuse) content from the aggregator</description>
</property>
<property>
<name>mdstoreManagerUrl</name>
<description>the address of the Mdstore Manager</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="ImportODF_hdfs"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportODF_hdfs">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ImportODF_hdfs</name>
<class>eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--hdfsPath</arg><arg>${contentPath}/odf_records_hdfs</arg>
<arg>--mdstoreManagerUrl</arg><arg>${mdstoreManagerUrl}</arg>
<arg>--mdFormat</arg><arg>ODF</arg>
<arg>--mdLayout</arg><arg>store</arg>
<arg>--mdInterpretation</arg><arg>cleaned</arg>
</spark>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/odf_records_hdfs</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--shouldHashId</arg><arg>${shouldHashId}</arg>
</spark>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/entities</arg>
<arg>--graphRawPath</arg><arg>${workingDir}/graph_raw</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>


@@ -195,8 +195,8 @@ public class MappersTest {
 			.findFirst()
 			.get();
 		assertEquals("0000-0001-6651-1178", pid.getValue());
-		assertEquals("ORCID", pid.getQualifier().getClassid());
-		assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
+		assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid());
+		assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname());
 		assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
 		assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
 		assertEquals("Votsi,Nefta", author.get().getFullname());