Merge branch 'ticket_8369' of https://code-repo.d4science.org/D-Net/dnet-hadoop into ticket_8369

This commit is contained in:
Claudio Atzori 2023-02-14 15:59:18 +01:00
commit 6b0a08e29c
10 changed files with 129 additions and 133 deletions

View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId> <artifactId>dhp-code-style</artifactId>
<version>1.2.5-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
<packaging>jar</packaging> <packaging>jar</packaging>

View File

@ -7,7 +7,8 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class ModelConstants { public class ModelConstants {
private ModelConstants() {} private ModelConstants() {
}
public static final String ORCID = "orcid"; public static final String ORCID = "orcid";
public static final String ORCID_PENDING = "orcid_pending"; public static final String ORCID_PENDING = "orcid_pending";
@ -80,7 +81,6 @@ public class ModelConstants {
public static final String PROVENANCE_DEDUP = "sysimport:dedup"; public static final String PROVENANCE_DEDUP = "sysimport:dedup";
public static final String PROVENANCE_ENRICH = "sysimport:enrich"; public static final String PROVENANCE_ENRICH = "sysimport:enrich";
public static final Qualifier PROVENANCE_ACTION_SET_QUALIFIER = qualifier( public static final Qualifier PROVENANCE_ACTION_SET_QUALIFIER = qualifier(
SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS); SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS);
@ -127,8 +127,6 @@ public class ModelConstants {
public static final String IS_REQUIRED_BY = "IsRequiredBy"; public static final String IS_REQUIRED_BY = "IsRequiredBy";
public static final String REQUIRES = "Requires"; public static final String REQUIRES = "Requires";
public static final String CITATION = "citation"; // subreltype public static final String CITATION = "citation"; // subreltype
public static final String CITES = "Cites"; public static final String CITES = "Cites";
public static final String IS_CITED_BY = "IsCitedBy"; public static final String IS_CITED_BY = "IsCitedBy";
@ -219,7 +217,7 @@ public class ModelConstants {
"main title", "main title", DNET_DATACITE_TITLE); "main title", "main title", DNET_DATACITE_TITLE);
public static final Qualifier ALTERNATIVE_TITLE_QUALIFIER = qualifier( public static final Qualifier ALTERNATIVE_TITLE_QUALIFIER = qualifier(
"alternative title", "alternative title", DNET_DATACITE_TITLE); "alternative title", "alternative title", DNET_DATACITE_TITLE);
private static final Qualifier SUBTITLE_QUALIFIER = qualifier("subtitle", "subtitle", DNET_DATACITE_TITLE); private static final Qualifier SUBTITLE_QUALIFIER = qualifier("subtitle", "subtitle", DNET_DATACITE_TITLE);

View File

@ -209,7 +209,8 @@ public class ModelSupport {
return idPrefixMap.get(clazz); return idPrefixMap.get(clazz);
} }
public static <X extends Oaf, Y extends Oaf, Z extends Oaf> Boolean sameClass(X left, Y right, Class<Z> superClazz) { public static <X extends Oaf, Y extends Oaf, Z extends Oaf> Boolean sameClass(X left, Y right,
Class<Z> superClazz) {
return isSubClass(left, superClazz) && isSubClass(right, superClazz); return isSubClass(left, superClazz) && isSubClass(right, superClazz);
} }

View File

@ -31,10 +31,10 @@ public class MergeUtils {
return mergeRelation(left, right); return mergeRelation(left, right);
} else { } else {
throw new RuntimeException( throw new RuntimeException(
String String
.format( .format(
"MERGE_FROM_AND_GET incompatible types: %s, %s", "MERGE_FROM_AND_GET incompatible types: %s, %s",
left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); left.getClass().getCanonicalName(), right.getClass().getCanonicalName()));
} }
} }
@ -53,10 +53,10 @@ public class MergeUtils {
return mergeProject(left, right); return mergeProject(left, right);
} else { } else {
throw new RuntimeException( throw new RuntimeException(
String String
.format( .format(
"MERGE_FROM_AND_GET incompatible types: %s, %s", "MERGE_FROM_AND_GET incompatible types: %s, %s",
left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); left.getClass().getCanonicalName(), right.getClass().getCanonicalName()));
} }
} }
@ -110,8 +110,8 @@ public class MergeUtils {
mergedEntity.setLastupdatetimestamp(enrich.getLastupdatetimestamp()); mergedEntity.setLastupdatetimestamp(enrich.getLastupdatetimestamp());
} else if (mergedEntity.getLastupdatetimestamp() != null && enrich.getLastupdatetimestamp() != null) { } else if (mergedEntity.getLastupdatetimestamp() != null && enrich.getLastupdatetimestamp() != null) {
mergedEntity mergedEntity
.setLastupdatetimestamp( .setLastupdatetimestamp(
Long.max(mergedEntity.getLastupdatetimestamp(), enrich.getLastupdatetimestamp())); Long.max(mergedEntity.getLastupdatetimestamp(), enrich.getLastupdatetimestamp()));
} }
mergedEntity.setPid(mergeLists(mergedEntity.getPid(), enrich.getPid())); mergedEntity.setPid(mergeLists(mergedEntity.getPid(), enrich.getPid()));
@ -138,7 +138,7 @@ public class MergeUtils {
checkArgument(Objects.equals(original.getTarget(), enrich.getTarget()), "target ids must be equal"); checkArgument(Objects.equals(original.getTarget(), enrich.getTarget()), "target ids must be equal");
checkArgument(Objects.equals(original.getRelType(), enrich.getRelType()), "relType(s) must be equal"); checkArgument(Objects.equals(original.getRelType(), enrich.getRelType()), "relType(s) must be equal");
checkArgument( checkArgument(
Objects.equals(original.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); Objects.equals(original.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal");
checkArgument(Objects.equals(original.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal"); checkArgument(Objects.equals(original.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal");
original.setProvenance(mergeLists(original.getProvenance(), enrich.getProvenance())); original.setProvenance(mergeLists(original.getProvenance(), enrich.getProvenance()));
@ -148,10 +148,10 @@ public class MergeUtils {
original.setValidationDate(ModelSupport.oldest(original.getValidationDate(), enrich.getValidationDate())); original.setValidationDate(ModelSupport.oldest(original.getValidationDate(), enrich.getValidationDate()));
} catch (ParseException e) { } catch (ParseException e) {
throw new IllegalArgumentException(String throw new IllegalArgumentException(String
.format( .format(
"invalid validation date format in relation [s:%s, t:%s]: %s", original.getSource(), "invalid validation date format in relation [s:%s, t:%s]: %s", original.getSource(),
original.getTarget(), original.getTarget(),
original.getValidationDate())); original.getValidationDate()));
} }
return (T) original; return (T) original;
@ -370,7 +370,7 @@ public class MergeUtils {
private static <T extends Oaf> T mergePublication(T original, T enrich) { private static <T extends Oaf> T mergePublication(T original, T enrich) {
//add publication specific fields. // add publication specific fields.
mergeEntityDataInfo(original, enrich); mergeEntityDataInfo(original, enrich);

View File

@ -363,7 +363,8 @@ public class OafMapperUtils {
final Entity entity, final Entity entity,
final String validationDate) { final String validationDate) {
final List<Provenance> provenance = getProvenance(entity.getCollectedfrom(), fromEntityDataInfo(entity.getDataInfo())); final List<Provenance> provenance = getProvenance(
entity.getCollectedfrom(), fromEntityDataInfo(entity.getDataInfo()));
return getRelation( return getRelation(
source, target, relType, subRelType, relClass, provenance, validationDate, null); source, target, relType, subRelType, relClass, provenance, validationDate, null);
} }

View File

@ -1,8 +1,13 @@
package eu.dnetlib.dhp.common.vocabulary; package eu.dnetlib.dhp.common.vocabulary;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import static org.mockito.Mockito.lenient;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
@ -12,73 +17,63 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import java.io.IOException; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import java.util.Collections; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import java.util.List; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.util.Objects;
import static org.mockito.Mockito.lenient;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
public class VocabularyTest { public class VocabularyTest {
@Mock
protected ISLookUpService isLookUpService;
@Mock protected VocabularyGroup vocabularies;
protected ISLookUpService isLookUpService;
protected VocabularyGroup vocabularies; @BeforeEach
public void setUpVocabulary() throws ISLookUpException, IOException {
@BeforeEach lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
public void setUpVocabulary() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
lenient() private static List<String> vocs() throws IOException {
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) return IOUtils
.thenReturn(synonyms()); .readLines(
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); Objects
} .requireNonNull(
VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")));
}
private static List<String> vocs() throws IOException { private static List<String> synonyms() throws IOException {
return IOUtils return IOUtils
.readLines( .readLines(
Objects Objects
.requireNonNull( .requireNonNull(
VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt"))); VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")));
} }
private static List<String> synonyms() throws IOException { @Test
return IOUtils void testVocabularyMatch() throws Exception {
.readLines( final String s = IOUtils.toString(this.getClass().getResourceAsStream("terms"));
Objects
.requireNonNull(
VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")));
}
for (String s1 : s.split("\n")) {
@Test final Qualifier t1 = vocabularies.getSynonymAsQualifier("dnet:publication_resource", s1);
void testVocabularyMatch () throws Exception{
final String s= IOUtils.toString(this.getClass().getResourceAsStream("terms"));
for (String s1 : s.split("\n")) { if (t1 == null) {
System.err.println(s1 + " Missing");
} else {
System.out.println("syn=" + s1 + " term = " + t1.getClassid());
final Qualifier t1 = vocabularies.getSynonymAsQualifier("dnet:publication_resource", s1); System.out
.println(
vocabularies.getSynonymAsQualifier("dnet:result_typologies", t1.getClassid()).getClassname());
}
}
if (t1 == null) { }
System.err.println(s1+ " Missing");
}
else {
System.out.println("syn=" + s1 + " term = " + t1.getClassid());
System.out.println(vocabularies.getSynonymAsQualifier("dnet:result_typologies", t1.getClassid()).getClassname());
}
}
}
} }

View File

@ -13,8 +13,8 @@ import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
case class CrossrefDT(doi: String, json: String, timestamp: Long) {} case class CrossrefDT(doi: String, json: String, timestamp: Long) {}
object CrossrefUtility { object CrossrefUtility {
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)" val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
val DOI_PREFIX = "10." val DOI_PREFIX = "10."
@ -37,7 +37,6 @@ object CrossrefUtility {
ret ret
} }
def extractDate(dt: String, datePart: List[List[Int]]): String = { def extractDate(dt: String, datePart: List[List[Int]]): String = {
if (StringUtils.isNotBlank(dt)) if (StringUtils.isNotBlank(dt))
return GraphCleaningFunctions.cleanDate(dt) return GraphCleaningFunctions.cleanDate(dt)
@ -60,36 +59,35 @@ object CrossrefUtility {
} }
private def generateDate( private def generateDate(
dt: String, dt: String,
datePart: List[List[Int]], datePart: List[List[Int]],
classId: String, classId: String,
schemeId: String schemeId: String
): StructuredProperty = { ): StructuredProperty = {
val dp = extractDate(dt, datePart) val dp = extractDate(dt, datePart)
if (StringUtils.isNotBlank(dp)) if (StringUtils.isNotBlank(dp))
structuredProperty(dp, classId, classId,schemeId) structuredProperty(dp, classId, classId, schemeId)
else else
null null
} }
private def generateItemFromType(objectType: String, vocabularies: VocabularyGroup): (Result, String) = {
private def generateItemFromType(objectType: String, vocabularies:VocabularyGroup): (Result, String) = {
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, objectType) val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, objectType)
if (term != null) { if (term != null) {
val resourceType = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, term.getClassid).getClassname val resourceType =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, term.getClassid).getClassname
resourceType match { resourceType match {
case "publication" =>(new Publication, resourceType) case "publication" => (new Publication, resourceType)
case "dataset" =>(new Dataset, resourceType) case "dataset" => (new Dataset, resourceType)
case "software" => (new Software, resourceType) case "software" => (new Software, resourceType)
case "otherresearchproduct" =>(new OtherResearchProduct, resourceType) case "otherresearchproduct" => (new OtherResearchProduct, resourceType)
} }
} else } else
null null
} }
def convert(input: String, vocabularies: VocabularyGroup): List[Oaf] = {
def convert(input: String, vocabularies:VocabularyGroup): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
@ -124,14 +122,13 @@ object CrossrefUtility {
result match { result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory) case publication: Publication => convertPublication(publication, json, cOBJCategory)
case dataset: Dataset => convertDataset(dataset) case dataset: Dataset => convertDataset(dataset)
} }
resultList = resultList ::: List(result) resultList = resultList ::: List(result)
resultList resultList
} }
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = { def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -140,8 +137,9 @@ object CrossrefUtility {
result.setPid( result.setPid(
List( List(
structuredProperty(doi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES) structuredProperty(doi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES)
).asJava) ).asJava
)
//MAPPING Crossref DOI into OriginalId //MAPPING Crossref DOI into OriginalId
//and Other Original Identifier of dataset like clinical-trial-number //and Other Original Identifier of dataset like clinical-trial-number
@ -149,11 +147,10 @@ object CrossrefUtility {
val alternativeIds: List[String] = for (JString(ids) <- json \ "alternative-id") yield ids val alternativeIds: List[String] = for (JString(ids) <- json \ "alternative-id") yield ids
val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi) val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
result.setOriginalId(tmp.filter(id => id != null).asJava) result.setOriginalId(tmp.filter(id => id != null).asJava)
// Add DataInfo // Add DataInfo
result.setDataInfo(dataInfo(false, false,0.9F,null, false,ModelConstants.REPOSITORY_PROVENANCE_ACTIONS)) result.setDataInfo(dataInfo(false, false, 0.9f, null, false, ModelConstants.REPOSITORY_PROVENANCE_ACTIONS))
result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long]) result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
result.setDateofcollection((json \ "indexed" \ "date-time").extract[String]) result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
@ -167,23 +164,26 @@ object CrossrefUtility {
// TITLE // TITLE
val mainTitles = val mainTitles =
for {JString(title) <- json \ "title" if title.nonEmpty} for { JString(title) <- json \ "title" if title.nonEmpty } yield structuredProperty(
yield title,
structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER) ModelConstants.MAIN_TITLE_QUALIFIER
)
val originalTitles = for { val originalTitles = for {
JString(title) <- json \ "original-title" if title.nonEmpty JString(title) <- json \ "original-title" if title.nonEmpty
} yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER) } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
val shortTitles = for { val shortTitles = for {
JString(title) <- json \ "short-title" if title.nonEmpty JString(title) <- json \ "short-title" if title.nonEmpty
} yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER) } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
val subtitles = val subtitles =
for {JString(title) <- json \ "subtitle" if title.nonEmpty} for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty(
yield structuredProperty(title, ModelConstants.SUBTITLE_QUALIFIER) title,
ModelConstants.SUBTITLE_QUALIFIER
)
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava) result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION // DESCRIPTION
val descriptionList = val descriptionList =
for {JString(description) <- json \ "abstract"} yield description for { JString(description) <- json \ "abstract" } yield description
result.setDescription(descriptionList.asJava) result.setDescription(descriptionList.asJava)
// Source // Source
@ -242,11 +242,9 @@ object CrossrefUtility {
//Mapping Subject //Mapping Subject
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List()) val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
if (subjectList.nonEmpty) { if (subjectList.nonEmpty) {
result.setSubject( result.setSubject(
subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
) )
} }
@ -265,8 +263,8 @@ object CrossrefUtility {
// Mapping instance // Mapping instance
val instance = new Instance() val instance = new Instance()
val license = for { val license = for {
JObject(license) <- json \ "license" JObject(license) <- json \ "license"
JField("URL", JString(lic)) <- license JField("URL", JString(lic)) <- license
JField("content-version", JString(content_version)) <- license JField("content-version", JString(content_version)) <- license
} yield (asField(lic), content_version) } yield (asField(lic), content_version)
val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue)) val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))

View File

@ -3,20 +3,20 @@ package eu.dnetlib.dhp.crossref
import eu.dnetlib.dhp.application.AbstractScalaApplication import eu.dnetlib.dhp.application.AbstractScalaApplication
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
class GenerateCrossrefDataset (propertyPath: String, args: Array[String], log: Logger) class GenerateCrossrefDataset(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) { extends AbstractScalaApplication(propertyPath, args, log: Logger) {
/** Here all the spark applications runs this method /** Here all the spark applications runs this method
* where the whole logic of the spark node is defined * where the whole logic of the spark node is defined
*/ */
override def run(): Unit = ??? override def run(): Unit = ???
} }
object GenerateCrossrefDataset {
object GenerateCrossrefDataset{ val log: Logger = LoggerFactory.getLogger(getClass)
val log:Logger = LoggerFactory.getLogger(getClass) val propertyPath = "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
val propertyPath ="/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
new GenerateCrossrefDataset(propertyPath,args, log).initialize().run() new GenerateCrossrefDataset(propertyPath, args, log).initialize().run()
} }
} }

View File

@ -284,11 +284,11 @@ object DataciteToOAFTransformation {
} }
def generateRelation( def generateRelation(
sourceId: String, sourceId: String,
targetId: String, targetId: String,
relClass: String, relClass: String,
collectedFrom: KeyValue, collectedFrom: KeyValue,
di: DataInfo di: DataInfo
): Relation = { ): Relation = {
val r = new Relation val r = new Relation
r.setSource(sourceId) r.setSource(sourceId)

View File

@ -360,10 +360,13 @@ object BioDBToOAF {
val rel = new Relation val rel = new Relation
val provenance = OafMapperUtils.getProvenance(Lists.newArrayList( val provenance = OafMapperUtils.getProvenance(
collectedFrom, Lists.newArrayList(
collectedFromMap("pdb") collectedFrom,
), REL_DATA_INFO) collectedFromMap("pdb")
),
REL_DATA_INFO
)
rel.setProvenance(provenance) rel.setProvenance(provenance)