forked from D-Net/dnet-hadoop
Compare commits
383 Commits
c777b50995
...
1622b40842
Author | SHA1 | Date |
---|---|---|
Andreas Czerniak | 1622b40842 | |
Andreas Czerniak | 854da6c0e7 | |
Andreas Czerniak | 2d0a4d5b85 | |
Claudio Atzori | 97ad94d7d9 | |
Sandro La Bruzzo | 7a3819144d | |
Claudio Atzori | f0ea2410e5 | |
Claudio Atzori | b37bc277c4 | |
Miriam Baglioni | 42e8f76778 | |
Miriam Baglioni | a7c4d0d16d | |
Miriam Baglioni | 7bf12ad24a | |
Miriam Baglioni | a75fb8c47a | |
Miriam Baglioni | 4d517ed9ec | |
Miriam Baglioni | e7d5a39c03 | |
Claudio Atzori | dbd6fa1d65 | |
Miriam Baglioni | 4993666d73 | |
Sandro La Bruzzo | 1b9e8378b3 | |
Sandro La Bruzzo | 57e2c4b749 | |
Sandro La Bruzzo | b78d2b71f0 | |
Claudio Atzori | 0f2144b5e0 | |
Claudio Atzori | dcd282977c | |
Claudio Atzori | 4f212652ca | |
Sandro La Bruzzo | 0163dadb7f | |
Miriam Baglioni | 904e1c2667 | |
Miriam Baglioni | 064f9bbd87 | |
Miriam Baglioni | 93f26fb742 | |
Miriam Baglioni | b7e450070b | |
Miriam Baglioni | af8a33638d | |
Miriam Baglioni | 639190370a | |
Miriam Baglioni | adccc2346a | |
Claudio Atzori | 908294d86e | |
Claudio Atzori | 58f8998e3d | |
Claudio Atzori | 174c3037e1 | |
Claudio Atzori | 045d767013 | |
Claudio Atzori | cb30770a0b | |
Claudio Atzori | bd59b58efb | |
Claudio Atzori | a6977197b3 | |
Miriam Baglioni | 4c60ee1718 | |
Miriam Baglioni | 92fd69e25d | |
Claudio Atzori | fe7e5f4748 | |
Claudio Atzori | bcea4e3a9b | |
Miriam Baglioni | a706ba0c08 | |
Antonis Lempesis | 81ee654271 | |
Antonis Lempesis | 7551e52e95 | |
Miriam Baglioni | 7a1b440413 | |
Claudio Atzori | 278cf08421 | |
Claudio Atzori | cccb16900c | |
Miriam Baglioni | 2a67ee13ec | |
Miriam Baglioni | 5c4fee3533 | |
Miriam Baglioni | 69e9ea9eeb | |
Miriam Baglioni | 31b26d48ac | |
Miriam Baglioni | bf3a9505e0 | |
Miriam Baglioni | 10579c0dd0 | |
Miriam Baglioni | 6116fc5d40 | |
Miriam Baglioni | b81efb6a9d | |
Miriam Baglioni | 73175ba086 | |
Miriam Baglioni | de6c4c8968 | |
Miriam Baglioni | b352fbe453 | |
Miriam Baglioni | 34ac56565d | |
Miriam Baglioni | 20ef1d657f | |
Miriam Baglioni | 813f856d3f | |
Miriam Baglioni | 2c126ed014 | |
Miriam Baglioni | bf52a1847b | |
Miriam Baglioni | 0807fdb65a | |
Miriam Baglioni | b5e11a3a0a | |
Miriam Baglioni | c5739c4266 | |
Miriam Baglioni | da5f6260aa | |
Miriam Baglioni | 4849270c55 | |
Claudio Atzori | 8d18500069 | |
Miriam Baglioni | 9d19b057b8 | |
Miriam Baglioni | be0acccf42 | |
Miriam Baglioni | 89ea9fa0e1 | |
Antonis Lempesis | 16539d7360 | |
Antonis Lempesis | 3edd661608 | |
Antonis Lempesis | a4c0cbb98c | |
Miriam Baglioni | e24a7f3496 | |
Miriam Baglioni | d1ae219cb4 | |
Miriam Baglioni | 460e6b95d6 | |
Sandro La Bruzzo | 3920d68992 | |
Antonis Lempesis | 58996972d9 | |
dimitrispie | c1cdec09a9 | |
Miriam Baglioni | 3cc1b7b153 | |
Miriam Baglioni | 5e5dfd619c | |
Miriam Baglioni | 63b648b0dd | |
Sandro La Bruzzo | b881ee5ef8 | |
Sandro La Bruzzo | 63952018c0 | |
Sandro La Bruzzo | e5bff64f2e | |
Claudio Atzori | e30e5ac8a8 | |
Claudio Atzori | 1790fa2d44 | |
Miriam Baglioni | 56409d1281 | |
Miriam Baglioni | a3592b463a | |
Miriam Baglioni | 22d4b5619b | |
Miriam Baglioni | 6fb6236cd4 | |
Claudio Atzori | aff3ddc8d2 | |
Miriam Baglioni | 573bd17cbb | |
Miriam Baglioni | 4eb8276493 | |
Miriam Baglioni | 936578aaf1 | |
Miriam Baglioni | 8d755cca80 | |
Claudio Atzori | 98eb292c59 | |
Claudio Atzori | 5e17247bb6 | |
Claudio Atzori | b70ecccea0 | |
Claudio Atzori | 25dc7929a9 | |
Claudio Atzori | eb43eda42a | |
Claudio Atzori | 41c70c607d | |
Alessia Bardi | 8f1e018ceb | |
Alessia Bardi | cba63e9f82 | |
Alessia Bardi | e53228401b | |
Claudio Atzori | adf17452b0 | |
Claudio Atzori | e6e177dda0 | |
Alessia Bardi | 6b5d7688a4 | |
Miriam Baglioni | b113586207 | |
Sandro La Bruzzo | 5d51b3dd4a | |
Miriam Baglioni | d9836f0cf3 | |
Miriam Baglioni | d1df01ff1e | |
Sandro La Bruzzo | ed0c352799 | |
Miriam Baglioni | 96a7d46278 | |
Sandro La Bruzzo | e9f285ec4d | |
Sandro La Bruzzo | bf880e2508 | |
Sandro La Bruzzo | 81bf604059 | |
Sandro La Bruzzo | 7af0bbd0b1 | |
Claudio Atzori | 9132727793 | |
Claudio Atzori | 08795cbd30 | |
Miriam Baglioni | f430688ff7 | |
Miriam Baglioni | 4bb1d43afc | |
Sandro La Bruzzo | 0fa0ce33d6 | |
Sandro La Bruzzo | f7011b90d8 | |
Claudio Atzori | 372633880f | |
Claudio Atzori | dd0b2e5244 | |
Claudio Atzori | c4c705aa46 | |
Claudio Atzori | 863a2f9db3 | |
Claudio Atzori | 9cac283bec | |
Miriam Baglioni | d9f80488cc | |
Miriam Baglioni | 58bc3f223a | |
Miriam Baglioni | 8905a39bf3 | |
Miriam Baglioni | 87eedad898 | |
Claudio Atzori | 3b19821f3c | |
Claudio Atzori | cfa4560769 | |
Claudio Atzori | d85af6fc25 | |
Claudio Atzori | 4fe7888817 | |
Claudio Atzori | 01e5e0142a | |
Claudio Atzori | 0df9574a6f | |
Claudio Atzori | 014e872ae1 | |
Claudio Atzori | 5c6d328537 | |
dimitrispie | 09fc2afdca | |
dimitrispie | 8750a71502 | |
dimitrispie | 25fc8abf77 | |
Antonis Lempesis | 0b4163ee0b | |
dimitrispie | 29f69f2f89 | |
Sandro La Bruzzo | bb7f556eff | |
Sandro La Bruzzo | 1e1f5e4fe0 | |
Miriam Baglioni | ac07ed8251 | |
Miriam Baglioni | 5fd0e610bf | |
Sandro La Bruzzo | feea154e89 | |
Sandro La Bruzzo | 028a8acad8 | |
Sandro La Bruzzo | 2164a2a889 | |
Sandro La Bruzzo | a7cf277d98 | |
Sandro La Bruzzo | 483d3039d1 | |
Sandro La Bruzzo | 93fe8ce8b2 | |
Sandro La Bruzzo | 35e20b0647 | |
Miriam Baglioni | fdb75b180e | |
Miriam Baglioni | 0506fa2654 | |
Sandro La Bruzzo | 6110a2b984 | |
Sandro La Bruzzo | 65ebe1019b | |
Sandro La Bruzzo | 155d8bf83f | |
Sandro La Bruzzo | 3426451d3f | |
Sandro La Bruzzo | 75298ec442 | |
Sandro La Bruzzo | 4542a2338b | |
Claudio Atzori | 90c2a4987e | |
Claudio Atzori | e5a2c596b2 | |
Claudio Atzori | f4538f3c4c | |
Claudio Atzori | 2b46b87f56 | |
Miriam Baglioni | 9fae872181 | |
Sandro La Bruzzo | fc03c99805 | |
Sandro La Bruzzo | 8a7c7d36db | |
Sandro La Bruzzo | 0c0d561bc4 | |
Claudio Atzori | 62fa61f3cf | |
Claudio Atzori | bd9a43cefd | |
Claudio Atzori | 3974fa7dc1 | |
Claudio Atzori | a24b9f8268 | |
Claudio Atzori | c0750fb17c | |
Claudio Atzori | bb5dca7979 | |
Miriam Baglioni | 793b5a8e5f | |
Miriam Baglioni | 5dc5792722 | |
Miriam Baglioni | 0136a8c266 | |
Miriam Baglioni | 1b79c0ee79 | |
Claudio Atzori | 10a32f287f | |
Antonis Lempesis | cb3adb90f4 | |
Antonis Lempesis | c283406829 | |
Claudio Atzori | e0395719d7 | |
Claudio Atzori | 82a4e4efae | |
Miriam Baglioni | 6d4a1c57ee | |
Sandro La Bruzzo | 9c82d670b8 | |
Sandro La Bruzzo | 1f5ee116ed | |
Sandro La Bruzzo | 2fd9ceac13 | |
Sandro La Bruzzo | 60ae874dcb | |
Sandro La Bruzzo | 2506d7a679 | |
Sandro La Bruzzo | cded363b55 | |
Miriam Baglioni | 4094f2bb9a | |
Miriam Baglioni | ec8b0219ff | |
Miriam Baglioni | 2bbece2ca5 | |
Sandro La Bruzzo | 2d67020c59 | |
Claudio Atzori | 49f897ef29 | |
Miriam Baglioni | 28ea532ece | |
Sandro La Bruzzo | 18c1d70ef4 | |
Sandro La Bruzzo | a1cafaf2e3 | |
Miriam Baglioni | 7c96e3fd46 | |
Miriam Baglioni | c7c0c3187b | |
Miriam Baglioni | c6a9f0a1a8 | |
Miriam Baglioni | 99d86134f5 | |
Claudio Atzori | 0a727d325d | |
Claudio Atzori | bafa2990f3 | |
Claudio Atzori | 668ac25224 | |
Claudio Atzori | 7d0a03f607 | |
Claudio Atzori | 941a50a2fc | |
Claudio Atzori | 7c804acda8 | |
Sandro La Bruzzo | efa09057db | |
Sandro La Bruzzo | 48923e46a1 | |
Claudio Atzori | d2c787d416 | |
Claudio Atzori | 975b10b711 | |
Claudio Atzori | 1ecceea788 | |
Miriam Baglioni | 4ec88c718c | |
Miriam Baglioni | 6f1a434e90 | |
Miriam Baglioni | 157d33ebf9 | |
Claudio Atzori | 7b81607035 | |
Miriam Baglioni | 6595135a1a | |
Miriam Baglioni | 43cae4ad88 | |
Miriam Baglioni | b3f9370125 | |
Miriam Baglioni | 92d0e18b55 | |
Miriam Baglioni | 881113743f | |
Miriam Baglioni | 47ccb53c4f | |
Miriam Baglioni | ffb0ce1d59 | |
Miriam Baglioni | 716021546e | |
Claudio Atzori | 1f2a3d1af0 | |
Sandro La Bruzzo | 3469cc2b1d | |
Sandro La Bruzzo | a7763d2492 | |
Miriam Baglioni | b8bdabfae9 | |
Miriam Baglioni | e5498052e8 | |
Miriam Baglioni | 935062edec | |
Antonis Lempesis | 26f086dd64 | |
Claudio Atzori | 8bdca3413f | |
Claudio Atzori | 148289150f | |
Sandro La Bruzzo | 2ca0a436ad | |
Sandro La Bruzzo | 9cb195314f | |
Miriam Baglioni | 6d3c4c4abe | |
Miriam Baglioni | 8cc50ecee0 | |
Miriam Baglioni | 88b73f4f49 | |
Miriam Baglioni | c371b23077 | |
Miriam Baglioni | 9e214ce0eb | |
Sandro La Bruzzo | 6477a40670 | |
Miriam Baglioni | 6f7ca539c6 | |
Miriam Baglioni | a7d50c499b | |
Antonis Lempesis | 91354c6068 | |
Miriam Baglioni | 94918a673c | |
Miriam Baglioni | 4c70201412 | |
Miriam Baglioni | 8442efd8d1 | |
Claudio Atzori | 5681e89544 | |
Miriam Baglioni | a22c29fba1 | |
Miriam Baglioni | c10ff6928c | |
Miriam Baglioni | 0857849a86 | |
Miriam Baglioni | df7ee77c7a | |
Miriam Baglioni | de63d29b6f | |
Miriam Baglioni | d50057b2d9 | |
Miriam Baglioni | edf55395e9 | |
Miriam Baglioni | d97ea82a29 | |
Miriam Baglioni | 96769b4481 | |
Miriam Baglioni | 683fe093cf | |
Miriam Baglioni | b2bb8d9d79 | |
Miriam Baglioni | 779318961c | |
Miriam Baglioni | 2480e590d1 | |
Miriam Baglioni | b9d124bb7c | |
Sandro La Bruzzo | 7bd224f051 | |
Claudio Atzori | 7fa49f6956 | |
Antonis Lempesis | f78afb5ef9 | |
Miriam Baglioni | 2aca6bfa0a | |
Miriam Baglioni | 09f36cffb8 | |
Claudio Atzori | 1225ba0b92 | |
Sandro La Bruzzo | d9cbca83f7 | |
Sandro La Bruzzo | 1be9aa0a5f | |
Sandro La Bruzzo | 4acfa8fa2e | |
Miriam Baglioni | d0ef7d91c5 | |
Sandro La Bruzzo | aafdffa6b3 | |
Sandro La Bruzzo | 034304b33a | |
Claudio Atzori | 6b34ba737e | |
Claudio Atzori | d147295c2f | |
Claudio Atzori | 3702fe478d | |
Sandro La Bruzzo | ac36aa7d1c | |
Sandro La Bruzzo | aeeebd573b | |
Sandro La Bruzzo | ab3a99d3e9 | |
Sandro La Bruzzo | ae4e99a471 | |
Claudio Atzori | 4f8970f8ed | |
Claudio Atzori | 00b78b9c58 | |
Claudio Atzori | c01dd0c925 | |
Miriam Baglioni | 652114c641 | |
Claudio Atzori | d0cf2963f0 | |
Claudio Atzori | 59f76b50d4 | |
Claudio Atzori | bc3372093e | |
Antonis Lempesis | 241dcf6df1 | |
Claudio Atzori | 515e068a78 | |
Claudio Atzori | 512e7b0170 | |
Claudio Atzori | d517c71458 | |
Claudio Atzori | e9157c67aa | |
Claudio Atzori | 98f37c8d81 | |
Claudio Atzori | c8850456e9 | |
Sandro La Bruzzo | c9870c5122 | |
Sandro La Bruzzo | f8329bc110 | |
Claudio Atzori | 7a73010acd | |
Miriam Baglioni | c7f6cd2591 | |
Sandro La Bruzzo | a894d7adf3 | |
miconis | 5f780a6ba1 | |
Miriam Baglioni | 1315952702 | |
Miriam Baglioni | 1cc09adfaa | |
Miriam Baglioni | 76d41602be | |
Miriam Baglioni | 46f82c7c8f | |
Sandro La Bruzzo | 7b15b88d4c | |
Antonis Lempesis | 41ecb1eb61 | |
Antonis Lempesis | 4b7c8dff2d | |
Sandro La Bruzzo | 51a03c0a50 | |
Claudio Atzori | dd568ec88b | |
Claudio Atzori | 14fbf92ad6 | |
Claudio Atzori | b292e4a700 | |
miconis | 995c1eddaf | |
Miriam Baglioni | 5d9cc2452d | |
miconis | 326bf63775 | |
Miriam Baglioni | 16b28494a9 | |
Miriam Baglioni | 63933808d4 | |
Sandro La Bruzzo | f2c8356ccf | |
Sandro La Bruzzo | 7387416e90 | |
Claudio Atzori | 914b3e92cb | |
Sandro La Bruzzo | 511da98d0c | |
Miriam Baglioni | fec40bdd95 | |
Miriam Baglioni | 83f51f1812 | |
Sandro La Bruzzo | 5606014b17 | |
Serafeim Chatzopoulos | 201ce71cc1 | |
Serafeim Chatzopoulos | e468a7b96b | |
Serafeim Chatzopoulos | de81007302 | |
Sandro La Bruzzo | 8f99d2af86 | |
Miriam Baglioni | e653756e3d | |
Miriam Baglioni | 9814c3e700 | |
Miriam Baglioni | c4ccd7b32c | |
Miriam Baglioni | c8321ad31a | |
Miriam Baglioni | 476a4708d6 | |
Miriam Baglioni | 5ec69889db | |
Miriam Baglioni | eedf7c3310 | |
Miriam Baglioni | f2118d771a | |
Miriam Baglioni | a90bac3bc9 | |
Miriam Baglioni | bd0d7bfba7 | |
Miriam Baglioni | 8daaa32e90 | |
Miriam Baglioni | bc9e3a06ba | |
Miriam Baglioni | eff499af9f | |
Miriam Baglioni | 5d70f842eb | |
Miriam Baglioni | c3931557e3 | |
Miriam Baglioni | 6bd1eca7e0 | |
Miriam Baglioni | 73dc082927 | |
Miriam Baglioni | ee13da9258 | |
Miriam Baglioni | 35e395eae8 | |
Miriam Baglioni | 774cdb190e | |
Miriam Baglioni | 886617afd0 | |
Miriam Baglioni | 320cf02d96 | |
Miriam Baglioni | 52ce35d57b | |
Miriam Baglioni | 970b387b8d | |
Miriam Baglioni | eae10c5894 | |
Miriam Baglioni | c028feef4f | |
Miriam Baglioni | d70f8c96fd | |
Miriam Baglioni | 5e38c7f42d | |
Miriam Baglioni | d418c309f5 | |
Miriam Baglioni | 618d2de2da | |
Miriam Baglioni | 59615da65e | |
Miriam Baglioni | 084b4ef999 | |
Miriam Baglioni | 8f322a73cb | |
Miriam Baglioni | 72397ea1ba | |
Miriam Baglioni | 5295d10691 | |
Miriam Baglioni | e9a17ec899 | |
Miriam Baglioni | 8429aed6c6 | |
Miriam Baglioni | 39b1a6edf6 | |
Miriam Baglioni | 9a58f1b93d | |
Miriam Baglioni | 13c66e16be | |
Miriam Baglioni | 6410ab71d8 | |
Miriam Baglioni | 65a242646d | |
Miriam Baglioni | 4b432fbee8 | |
Miriam Baglioni | 87a6e2b967 | |
Miriam Baglioni | 69fd40fd30 | |
Miriam Baglioni | 86e50f7311 | |
Miriam Baglioni | da88c850c6 | |
Miriam Baglioni | 2f66fedfec |
|
@ -3,8 +3,6 @@
|
|||
*.iws
|
||||
*.ipr
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
*~
|
||||
.vscode
|
||||
.metals
|
||||
|
@ -27,4 +25,4 @@ spark-warehouse
|
|||
/**/job-override.properties
|
||||
/**/*.log
|
||||
/**/.factorypath
|
||||
|
||||
/**/.scalafmt.conf
|
||||
|
|
|
@ -22,9 +22,20 @@
|
|||
<id>dnet45-releases</id>
|
||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
||||
</repository>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<build>
|
||||
<extensions>
|
||||
<extension>
|
||||
<groupId>org.apache.maven.wagon</groupId>
|
||||
<artifactId>wagon-ssh</artifactId>
|
||||
<version>2.10</version>
|
||||
</extension>
|
||||
</extensions>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
|
@ -35,7 +46,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-site-plugin</artifactId>
|
||||
<version>3.7.1</version>
|
||||
<version>3.9.1</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
|
@ -43,6 +54,7 @@
|
|||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
|
||||
</properties>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,21 @@
|
|||
style = defaultWithAlign
|
||||
|
||||
align.openParenCallSite = false
|
||||
align.openParenDefnSite = false
|
||||
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
|
||||
continuationIndent.callSite = 2
|
||||
continuationIndent.defnSite = 2
|
||||
danglingParentheses = true
|
||||
indentOperator = spray
|
||||
maxColumn = 120
|
||||
newlines.alwaysBeforeTopLevelStatements = true
|
||||
project.excludeFilters = [".*\\.sbt"]
|
||||
rewrite.rules = [AvoidInfix]
|
||||
rewrite.rules = [ExpandImportSelectors]
|
||||
rewrite.rules = [RedundantBraces]
|
||||
rewrite.rules = [RedundantParens]
|
||||
rewrite.rules = [SortImports]
|
||||
rewrite.rules = [SortModifiers]
|
||||
rewrite.rules = [PreferCurlyFors]
|
||||
spaces.inImportCurlyBraces = false
|
||||
unindentTopLevelOperators = true
|
|
@ -0,0 +1,21 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
|
@ -10,6 +10,9 @@
|
|||
<packaging>pom</packaging>
|
||||
|
||||
<description>This module is a container for the build tools used in dnet-hadoop</description>
|
||||
<properties>
|
||||
<maven.javadoc.skip>true</maven.javadoc.skip>
|
||||
</properties>
|
||||
|
||||
<modules>
|
||||
<module>dhp-code-style</module>
|
||||
|
@ -17,4 +20,12 @@
|
|||
<module>dhp-build-properties-maven-plugin</module>
|
||||
</modules>
|
||||
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-build/</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
|
@ -13,7 +13,51 @@
|
|||
<artifactId>dhp-common</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>${net.alchim31.maven.version}</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-doc</id>
|
||||
<phase>process-resources</phase> <!-- or wherever -->
|
||||
<goals>
|
||||
<goal>doc</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
|
||||
|
|
|
@ -1,413 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class GraphResultMapper implements Serializable {
|
||||
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in) {
|
||||
|
||||
CommunityResult out = new CommunityResult();
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
switch (ort.get().getClassid()) {
|
||||
case "publication":
|
||||
Optional<Journal> journal = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
|
||||
if (journal.isPresent()) {
|
||||
Journal j = journal.get();
|
||||
Container c = new Container();
|
||||
c.setConferencedate(j.getConferencedate());
|
||||
c.setConferenceplace(j.getConferenceplace());
|
||||
c.setEdition(j.getEdition());
|
||||
c.setEp(j.getEp());
|
||||
c.setIss(j.getIss());
|
||||
c.setIssnLinking(j.getIssnLinking());
|
||||
c.setIssnOnline(j.getIssnOnline());
|
||||
c.setIssnPrinted(j.getIssnPrinted());
|
||||
c.setName(j.getName());
|
||||
c.setSp(j.getSp());
|
||||
c.setVol(j.getVol());
|
||||
out.setContainer(c);
|
||||
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
|
||||
}
|
||||
break;
|
||||
case "dataset":
|
||||
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
|
||||
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
|
||||
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
|
||||
|
||||
out
|
||||
.setGeolocation(
|
||||
Optional
|
||||
.ofNullable(id.getGeolocation())
|
||||
.map(
|
||||
igl -> igl
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(gli -> {
|
||||
GeoLocation gl = new GeoLocation();
|
||||
gl.setBox(gli.getBox());
|
||||
gl.setPlace(gli.getPlace());
|
||||
gl.setPoint(gli.getPoint());
|
||||
return gl;
|
||||
})
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "software":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
|
||||
Optional
|
||||
.ofNullable(is.getCodeRepositoryUrl())
|
||||
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(is.getDocumentationUrl())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setDocumentationUrl(
|
||||
value
|
||||
.stream()
|
||||
.map(Field::getValue)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(is.getProgrammingLanguage())
|
||||
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
|
||||
|
||||
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "other":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
|
||||
out
|
||||
.setContactgroup(
|
||||
Optional
|
||||
.ofNullable(ir.getContactgroup())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out
|
||||
.setContactperson(
|
||||
Optional
|
||||
.ofNullable(ir.getContactperson())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setTool(
|
||||
Optional
|
||||
.ofNullable(ir.getTool())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getAuthor())
|
||||
.ifPresent(
|
||||
ats -> out.setAuthor(ats.stream().map(GraphResultMapper::getAuthor).collect(Collectors.toList())));
|
||||
|
||||
// I do not map Access Right UNKNOWN or OTHER
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
|
||||
if (oar.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
|
||||
out
|
||||
.setBestaccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
final List<String> contributorList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getContributor())
|
||||
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
|
||||
out.setContributor(contributorList);
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getCountry())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setCountry(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
c -> {
|
||||
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
|
||||
return null;
|
||||
}
|
||||
Country country = new Country();
|
||||
country.setCode(c.getClassid());
|
||||
country.setLabel(c.getClassname());
|
||||
Optional
|
||||
.ofNullable(c.getDataInfo())
|
||||
.ifPresent(
|
||||
provenance -> country
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
provenance
|
||||
.getProvenanceaction()
|
||||
.getClassname(),
|
||||
c.getDataInfo().getTrust())));
|
||||
return country;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
final List<String> coverageList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getCoverage())
|
||||
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
|
||||
out.setCoverage(coverageList);
|
||||
|
||||
out.setDateofcollection(input.getDateofcollection());
|
||||
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setDescription(descriptionList);
|
||||
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
|
||||
if (oStr.isPresent()) {
|
||||
out.setEmbargoenddate(oStr.get().getValue());
|
||||
}
|
||||
|
||||
final List<String> formatList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getFormat())
|
||||
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
|
||||
out.setFormat(formatList);
|
||||
out.setId(input.getId());
|
||||
out.setOriginalId(input.getOriginalId());
|
||||
|
||||
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
|
||||
.ofNullable(input.getInstance());
|
||||
|
||||
if (oInst.isPresent()) {
|
||||
out
|
||||
.setInstance(
|
||||
oInst.get().stream().map(GraphResultMapper::getInstance).collect(Collectors.toList()));
|
||||
|
||||
}
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
|
||||
if (oL.isPresent()) {
|
||||
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
|
||||
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
|
||||
}
|
||||
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
|
||||
if (oLong.isPresent()) {
|
||||
out.setLastupdatetimestamp(oLong.get());
|
||||
}
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setMaintitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setSubtitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
List<ControlledField> pids = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.stream()
|
||||
.forEach(
|
||||
p -> pids
|
||||
.add(
|
||||
ControlledField
|
||||
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
|
||||
out.setPid(pids);
|
||||
oStr = Optional.ofNullable(input.getDateofacceptance());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublicationdate(oStr.get().getValue());
|
||||
}
|
||||
oStr = Optional.ofNullable(input.getPublisher());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublisher(oStr.get().getValue());
|
||||
}
|
||||
|
||||
List<String> sourceList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSource())
|
||||
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
|
||||
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
|
||||
List<Subject> subjectList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSubject())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.forEach(s -> subjectList.add(getSubject(s))));
|
||||
|
||||
out.setSubjects(subjectList);
|
||||
|
||||
out.setType(input.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
out
|
||||
.setCollectedfrom(
|
||||
input
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return out;
|
||||
|
||||
}
|
||||
|
||||
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
CommunityInstance instance = new CommunityInstance();
|
||||
|
||||
setCommonValue(i, instance);
|
||||
|
||||
instance
|
||||
.setCollectedfrom(
|
||||
KeyValue
|
||||
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
|
||||
|
||||
instance
|
||||
.setHostedby(
|
||||
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
|
||||
|
||||
return instance;
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
|
||||
.ofNullable(i.getAccessright());
|
||||
if (opAr.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
|
||||
instance
|
||||
.setAccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getLicense())
|
||||
.ifPresent(value -> instance.setLicense(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getDateofacceptance())
|
||||
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getRefereed())
|
||||
.ifPresent(value -> instance.setRefereed(value.getClassname()));
|
||||
Optional
|
||||
.ofNullable(i.getInstancetype())
|
||||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
|
||||
}
|
||||
|
||||
private static Subject getSubject(StructuredProperty s) {
|
||||
Subject subject = new Subject();
|
||||
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
|
||||
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
Provenance p = new Provenance();
|
||||
p.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
p.setTrust(di.get().getTrust());
|
||||
subject.setProvenance(p);
|
||||
}
|
||||
|
||||
return subject;
|
||||
}
|
||||
|
||||
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
|
||||
Author a = new Author();
|
||||
a.setFullname(oa.getFullname());
|
||||
a.setName(oa.getName());
|
||||
a.setSurname(oa.getSurname());
|
||||
a.setRank(oa.getRank());
|
||||
|
||||
Optional<List<StructuredProperty>> oPids = Optional
|
||||
.ofNullable(oa.getPid());
|
||||
if (oPids.isPresent()) {
|
||||
Pid pid = getOrcid(oPids.get());
|
||||
if (pid != null) {
|
||||
a.setPid(pid);
|
||||
}
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()),
|
||||
Provenance
|
||||
.newInstance(
|
||||
di.get().getProvenanceaction().getClassname(),
|
||||
di.get().getTrust()));
|
||||
} else {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue())
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
|
@ -28,7 +28,7 @@ public class HdfsSupport {
|
|||
* @param configuration Configuration of hadoop env
|
||||
*/
|
||||
public static boolean exists(String path, Configuration configuration) {
|
||||
logger.info("Removing path: {}", path);
|
||||
logger.info("Checking existence for path: {}", path);
|
||||
return rethrowAsRuntimeException(
|
||||
() -> {
|
||||
Path f = new Path(path);
|
||||
|
|
|
@ -85,6 +85,13 @@ public class MakeTarArchive implements Serializable {
|
|||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
if (name.startsWith("part-") & name.length() > 10) {
|
||||
String tmp = name.substring(0, 10);
|
||||
if (name.contains(".")) {
|
||||
tmp += name.substring(name.indexOf("."));
|
||||
}
|
||||
name = tmp;
|
||||
}
|
||||
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
|
||||
entry.setSize(fileStatus.getLen());
|
||||
current_size += fileStatus.getLen();
|
||||
|
|
|
@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable {
|
|||
final String syn = arr[2].trim();
|
||||
|
||||
vocs.addSynonyms(vocId, termId, syn);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// add the term names as synonyms
|
||||
vocs.vocs.values().forEach(voc -> {
|
||||
voc.getTerms().values().forEach(term -> {
|
||||
voc.addSynonym(term.getName().toLowerCase(), term.getId());
|
||||
});
|
||||
});
|
||||
|
||||
return vocs;
|
||||
}
|
||||
|
||||
|
|
|
@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
@ -27,8 +29,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
public static final int ORCID_LEN = 19;
|
||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||
public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
|
||||
|
||||
public static final String TITLE_TEST = "test";
|
||||
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
|
||||
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
|
@ -83,6 +88,22 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
|
||||
public static <T extends Oaf> boolean filter(T value) {
|
||||
if (Boolean.TRUE
|
||||
.equals(
|
||||
Optional
|
||||
.ofNullable(value)
|
||||
.map(
|
||||
o -> Optional
|
||||
.ofNullable(o.getDataInfo())
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getInvisible())
|
||||
.orElse(true))
|
||||
.orElse(true))
|
||||
.orElse(true))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -112,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T cleanup(T value) {
|
||||
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -195,14 +216,29 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
final String title = sp
|
||||
.getValue()
|
||||
.toLowerCase();
|
||||
final String residual = Unidecode
|
||||
.decode(title)
|
||||
.replaceAll(TITLE_FILTER_REGEX, "");
|
||||
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||
final String decoded = Unidecode.decode(title);
|
||||
|
||||
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||
return decoded
|
||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||
}
|
||||
return !decoded
|
||||
.replaceAll("\\W|\\d", "")
|
||||
.isEmpty();
|
||||
})
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getFormat())) {
|
||||
r
|
||||
.setFormat(
|
||||
r
|
||||
.getFormat()
|
||||
.stream()
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getDescription())) {
|
||||
r
|
||||
.setDescription(
|
||||
|
@ -225,6 +261,38 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
if (Objects.nonNull(r.getInstance())) {
|
||||
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
|
||||
if (r instanceof Publication) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof Dataset) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof Software) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof OtherResearchProduct) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
}
|
||||
}
|
||||
|
||||
if (Objects.nonNull(i.getPid())) {
|
||||
i.setPid(processPidCleaning(i.getPid()));
|
||||
}
|
||||
|
|
|
@ -4,19 +4,19 @@ package eu.dnetlib.dhp.utils;
|
|||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.codec.binary.Base64OutputStream;
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -26,6 +26,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import com.google.common.collect.Maps;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import net.minidev.json.JSONArray;
|
||||
import scala.collection.JavaConverters;
|
||||
import scala.collection.Seq;
|
||||
|
@ -52,10 +54,56 @@ public class DHPUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves from the metadata store manager application the list of paths associated with mdstores characterized
|
||||
* by he given format, layout, interpretation
|
||||
* @param mdstoreManagerUrl the URL of the mdstore manager service
|
||||
* @param format the mdstore format
|
||||
* @param layout the mdstore layout
|
||||
* @param interpretation the mdstore interpretation
|
||||
* @param includeEmpty include Empty mdstores
|
||||
* @return the set of hdfs paths
|
||||
* @throws IOException in case of HTTP communication issues
|
||||
*/
|
||||
public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
|
||||
final String format,
|
||||
final String layout,
|
||||
final String interpretation,
|
||||
boolean includeEmpty) throws IOException {
|
||||
final String url = mdstoreManagerUrl + "/mdstores/";
|
||||
final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
final HttpGet req = new HttpGet(url);
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
final String json = IOUtils.toString(response.getEntity().getContent());
|
||||
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
|
||||
return Arrays
|
||||
.stream(mdstores)
|
||||
.filter(md -> md.getFormat().equalsIgnoreCase(format))
|
||||
.filter(md -> md.getLayout().equalsIgnoreCase(layout))
|
||||
.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
|
||||
.filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
|
||||
.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
|
||||
.filter(md -> includeEmpty || md.getSize() > 0)
|
||||
.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String generateIdentifier(final String originalId, final String nsPrefix) {
|
||||
return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
|
||||
}
|
||||
|
||||
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
|
||||
|
||||
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
|
||||
|
||||
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
|
||||
}
|
||||
|
||||
public static String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
package eu.dnetlib.dhp.application
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/** This is the main Interface SparkApplication
|
||||
* where all the Spark Scala class should inherit
|
||||
*/
|
||||
trait SparkScalaApplication {
|
||||
|
||||
/** This is the path in the classpath of the json
|
||||
* describes all the argument needed to run
|
||||
*/
|
||||
val propertyPath: String
|
||||
|
||||
/** Utility to parse the arguments using the
|
||||
* property json in the classpath identified from
|
||||
* the variable propertyPath
|
||||
*
|
||||
* @param args the list of arguments
|
||||
*/
|
||||
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
parser
|
||||
}
|
||||
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
def run(): Unit
|
||||
}
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.slf4j.Logger
|
||||
|
||||
abstract class AbstractScalaApplication(
|
||||
val propertyPath: String,
|
||||
val args: Array[String],
|
||||
log: Logger
|
||||
) extends SparkScalaApplication {
|
||||
|
||||
var parser: ArgumentApplicationParser = null
|
||||
|
||||
var spark: SparkSession = null
|
||||
|
||||
def initialize(): SparkScalaApplication = {
|
||||
parser = parseArguments(args)
|
||||
spark = createSparkSession()
|
||||
this
|
||||
}
|
||||
|
||||
/** Utility for creating a spark session starting from parser
|
||||
*
|
||||
* @return a spark Session
|
||||
*/
|
||||
private def createSparkSession(): SparkSession = {
|
||||
require(parser != null)
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val master = parser.get("master")
|
||||
log.info(s"Creating Spark session: Master: $master")
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,442 @@
|
|||
package eu.dnetlib.dhp.sx.graph.scholix
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.sx.scholix._
|
||||
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
||||
object ScholixUtils extends Serializable {
|
||||
|
||||
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
||||
|
||||
val DATE_RELATION_KEY: String = "RelationDate"
|
||||
|
||||
case class RelationVocabulary(original: String, inverse: String) {}
|
||||
|
||||
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
||||
|
||||
val relations: Map[String, RelationVocabulary] = {
|
||||
val input = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
|
||||
)
|
||||
.mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
json.extract[Map[String, RelationVocabulary]]
|
||||
}
|
||||
|
||||
def extractRelationDate(relation: Relation): String = {
|
||||
|
||||
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
||||
null
|
||||
else {
|
||||
val date = relation.getProperties.asScala
|
||||
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
|
||||
.map(p => p.getValue)
|
||||
if (date.isDefined)
|
||||
date.get
|
||||
else
|
||||
null
|
||||
}
|
||||
}
|
||||
|
||||
def extractRelationDate(summary: ScholixSummary): String = {
|
||||
|
||||
if (summary.getDate == null || summary.getDate.isEmpty)
|
||||
null
|
||||
else {
|
||||
summary.getDate.get(0)
|
||||
}
|
||||
}
|
||||
|
||||
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
|
||||
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
|
||||
|
||||
}
|
||||
|
||||
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
||||
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
||||
}
|
||||
|
||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
||||
override def zero: RelatedEntities = null
|
||||
|
||||
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
|
||||
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
|
||||
if (b == null)
|
||||
RelatedEntities(a._1, relatedDataset, relatedPublication)
|
||||
else
|
||||
RelatedEntities(
|
||||
a._1,
|
||||
b.relatedDataset + relatedDataset,
|
||||
b.relatedPublication + relatedPublication
|
||||
)
|
||||
}
|
||||
|
||||
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
|
||||
if (b1 != null && b2 != null)
|
||||
RelatedEntities(
|
||||
b1.id,
|
||||
b1.relatedDataset + b2.relatedDataset,
|
||||
b1.relatedPublication + b2.relatedPublication
|
||||
)
|
||||
else if (b1 != null)
|
||||
b1
|
||||
else
|
||||
b2
|
||||
}
|
||||
|
||||
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
|
||||
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
}
|
||||
|
||||
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
|
||||
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
|
||||
override def zero: Scholix = null
|
||||
|
||||
def scholix_complete(s: Scholix): Boolean = {
|
||||
if (s == null || s.getIdentifier == null) {
|
||||
false
|
||||
} else if (s.getSource == null || s.getTarget == null) {
|
||||
false
|
||||
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
||||
false
|
||||
else
|
||||
true
|
||||
}
|
||||
|
||||
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
|
||||
if (scholix_complete(b)) b else a._2
|
||||
}
|
||||
|
||||
override def merge(b1: Scholix, b2: Scholix): Scholix = {
|
||||
if (scholix_complete(b1)) b1 else b2
|
||||
}
|
||||
|
||||
override def finish(reduction: Scholix): Scholix = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
|
||||
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
}
|
||||
|
||||
def createInverseScholixRelation(scholix: Scholix): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
s.setPublisher(scholix.getPublisher)
|
||||
s.setLinkprovider(scholix.getLinkprovider)
|
||||
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
||||
s.setSource(scholix.getTarget)
|
||||
s.setTarget(scholix.getSource)
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
|
||||
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
|
||||
}(collection.breakOut)
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
|
||||
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
|
||||
new ScholixEntityId(
|
||||
d.getDatasourceName,
|
||||
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
|
||||
)
|
||||
}(collection.breakOut)
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
|
||||
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
|
||||
|
||||
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
|
||||
new ScholixEntityId(
|
||||
c.getValue,
|
||||
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||
)
|
||||
}.toList
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
s.setPublisher(scholix.getPublisher)
|
||||
s.setLinkprovider(scholix.getLinkprovider)
|
||||
s.setRelationship(scholix.getRelationship)
|
||||
s.setSource(scholix.getSource)
|
||||
s.setTarget(generateScholixResourceFromSummary(target))
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
}
|
||||
|
||||
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
s.setPublisher(scholix.getPublisher)
|
||||
s.setLinkprovider(scholix.getLinkprovider)
|
||||
s.setRelationship(scholix.getRelationship)
|
||||
s.setSource(scholix.getSource)
|
||||
s.setTarget(target)
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
}
|
||||
|
||||
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
|
||||
val r = new ScholixResource
|
||||
r.setIdentifier(summaryObject.getLocalIdentifier)
|
||||
r.setDnetIdentifier(summaryObject.getId)
|
||||
|
||||
r.setObjectType(summaryObject.getTypology.toString)
|
||||
r.setObjectSubType(summaryObject.getSubType)
|
||||
|
||||
if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
|
||||
r.setTitle(summaryObject.getTitle.get(0))
|
||||
|
||||
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
|
||||
val l: List[ScholixEntityId] =
|
||||
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
|
||||
if (l.nonEmpty)
|
||||
r.setCreator(l.asJava)
|
||||
}
|
||||
|
||||
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
|
||||
r.setPublicationDate(summaryObject.getDate.get(0))
|
||||
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
|
||||
val plist: List[ScholixEntityId] =
|
||||
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
||||
|
||||
if (plist.nonEmpty)
|
||||
r.setPublisher(plist.asJava)
|
||||
}
|
||||
|
||||
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
|
||||
|
||||
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
|
||||
.map(c =>
|
||||
new ScholixCollectedFrom(
|
||||
new ScholixEntityId(
|
||||
c.getDatasourceName,
|
||||
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||
),
|
||||
"collected",
|
||||
"complete"
|
||||
)
|
||||
)
|
||||
.toList
|
||||
|
||||
if (l.nonEmpty)
|
||||
r.setCollectedFrom(l.asJava)
|
||||
|
||||
}
|
||||
r
|
||||
}
|
||||
|
||||
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
|
||||
if (relation == null || source == null)
|
||||
return null
|
||||
val s = new Scholix
|
||||
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
|
||||
if (l.isEmpty)
|
||||
l = extractCollectedFrom(source)
|
||||
if (l.isEmpty)
|
||||
return null
|
||||
s.setLinkprovider(l.asJava)
|
||||
var d = extractRelationDate(relation)
|
||||
if (d == null)
|
||||
d = source.getPublicationDate
|
||||
|
||||
s.setPublicationDate(d)
|
||||
|
||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||
s.setPublisher(source.getPublisher)
|
||||
}
|
||||
|
||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setSource(source)
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
|
||||
|
||||
if (relation == null || source == null)
|
||||
return null
|
||||
|
||||
val s = new Scholix
|
||||
|
||||
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
|
||||
if (l.isEmpty)
|
||||
l = extractCollectedFrom(source)
|
||||
if (l.isEmpty)
|
||||
return null
|
||||
|
||||
s.setLinkprovider(l.asJava)
|
||||
|
||||
var d = extractRelationDate(relation)
|
||||
if (d == null)
|
||||
d = extractRelationDate(source)
|
||||
|
||||
s.setPublicationDate(d)
|
||||
|
||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||
val l: List[ScholixEntityId] = source.getPublisher.asScala
|
||||
.map { p =>
|
||||
new ScholixEntityId(p, null)
|
||||
}(collection.breakOut)
|
||||
|
||||
if (l.nonEmpty)
|
||||
s.setPublisher(l.asJava)
|
||||
}
|
||||
|
||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setSource(generateScholixResourceFromSummary(source))
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
def findURLForPID(
|
||||
pidValue: List[StructuredProperty],
|
||||
urls: List[String]
|
||||
): List[(StructuredProperty, String)] = {
|
||||
pidValue.map { p =>
|
||||
val pv = p.getValue
|
||||
|
||||
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||
(p, r.orNull)
|
||||
}
|
||||
}
|
||||
|
||||
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||
return List()
|
||||
r.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||
.filter(i => i.getPid != null && i.getUrl != null)
|
||||
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||
.distinct
|
||||
.toList
|
||||
}
|
||||
|
||||
def resultToSummary(r: Result): ScholixSummary = {
|
||||
val s = new ScholixSummary
|
||||
s.setId(r.getId)
|
||||
if (r.getPid == null || r.getPid.isEmpty)
|
||||
return null
|
||||
|
||||
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
|
||||
if (persistentIdentifiers.isEmpty)
|
||||
return null
|
||||
s.setLocalIdentifier(persistentIdentifiers.asJava)
|
||||
if (r.isInstanceOf[Publication])
|
||||
s.setTypology(Typology.publication)
|
||||
else
|
||||
s.setTypology(Typology.dataset)
|
||||
|
||||
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
|
||||
|
||||
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
|
||||
val titles: List[String] = r.getTitle.asScala.map(t => t.getValue).toList
|
||||
if (titles.nonEmpty)
|
||||
s.setTitle(titles.asJava)
|
||||
else
|
||||
return null
|
||||
}
|
||||
|
||||
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
|
||||
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
|
||||
if (authors.nonEmpty)
|
||||
s.setAuthor(authors.asJava)
|
||||
}
|
||||
if (r.getInstance() != null) {
|
||||
val dt: List[String] = r
|
||||
.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getDateofacceptance != null)
|
||||
.map(i => i.getDateofacceptance.getValue)
|
||||
.toList
|
||||
if (dt.nonEmpty)
|
||||
s.setDate(dt.distinct.asJava)
|
||||
}
|
||||
if (r.getDescription != null && !r.getDescription.isEmpty) {
|
||||
val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
|
||||
if (d.isDefined)
|
||||
s.setDescription(d.get.getValue)
|
||||
}
|
||||
|
||||
if (r.getSubject != null && !r.getSubject.isEmpty) {
|
||||
val subjects: List[SchemeValue] = r.getSubject.asScala
|
||||
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
|
||||
.toList
|
||||
if (subjects.nonEmpty)
|
||||
s.setSubject(subjects.asJava)
|
||||
}
|
||||
|
||||
if (r.getPublisher != null)
|
||||
s.setPublisher(List(r.getPublisher.getValue).asJava)
|
||||
|
||||
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
|
||||
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
|
||||
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
|
||||
.toList
|
||||
if (cf.nonEmpty)
|
||||
s.setDatasources(cf.distinct.asJava)
|
||||
}
|
||||
|
||||
s.setRelatedDatasets(0)
|
||||
s.setRelatedPublications(0)
|
||||
s.setRelatedUnknown(0)
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
}
|
|
@ -107,7 +107,7 @@ class OafMapperUtilsTest {
|
|||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
|
||||
|
|
|
@ -107,7 +107,7 @@
|
|||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=2560
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
|
@ -159,7 +159,7 @@
|
|||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=2560
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
|
|
|
@ -99,7 +99,7 @@
|
|||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=2560
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
|
|
|
@ -29,6 +29,13 @@
|
|||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-doc</id>
|
||||
<phase>process-resources</phase> <!-- or wherever -->
|
||||
<goals>
|
||||
<goal>doc</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
|
|
|
@ -0,0 +1,92 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class Constants {
|
||||
|
||||
public static final String DOI = "doi";
|
||||
public static final String DOI_CLASSNAME = "Digital Object Identifier";
|
||||
|
||||
public static final String DEFAULT_DELIMITER = ",";
|
||||
|
||||
public static final String UPDATE_DATA_INFO_TYPE = "update";
|
||||
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
|
||||
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
|
||||
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
|
||||
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
|
||||
|
||||
public static final String FOS_CLASS_ID = "FOS";
|
||||
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
|
||||
|
||||
public static final String SDG_CLASS_ID = "SDG";
|
||||
public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
|
||||
|
||||
public static final String NULL = "NULL";
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private Constants() {
|
||||
}
|
||||
|
||||
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
|
||||
return Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static StructuredProperty getSubject(String sbj, String classid, String classname,
|
||||
String diqualifierclassid) {
|
||||
if (sbj.equals(NULL))
|
||||
return null;
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(sbj);
|
||||
sp
|
||||
.setQualifier(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
classid,
|
||||
classname,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
||||
sp
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
diqualifierclassid,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
|
||||
return sp;
|
||||
|
||||
}
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Just collects all the atomic actions produced for the different results and saves them in
|
||||
* outputpath for the ActionSet
|
||||
*/
|
||||
public class CollectAndSave implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
CollectAndSave.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json")));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath {}: ", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
collectAndSave(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
sc
|
||||
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
|
||||
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Subset of the information of the generic results that are needed to create the atomic action
|
||||
*/
|
||||
public class PreparedResult implements Serializable {
|
||||
private String id; // openaire id
|
||||
private String value; // doi
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -15,7 +16,6 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
@ -24,11 +24,15 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
|
@ -46,7 +50,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
.toString(
|
||||
SparkAtomicActionScoreJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
|
@ -65,14 +69,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final String bipScorePath = parser.get("bipScorePath");
|
||||
log.info("bipScorePath: {}", bipScorePath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
|
@ -80,12 +76,11 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
|
||||
prepareResults(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
|
||||
String bipScorePath, Class<I> inputClazz) {
|
||||
private static <I extends Result> void prepareResults(SparkSession spark, String bipScorePath, String outputPath) {
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
|
@ -101,41 +96,19 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
return bs;
|
||||
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
|
||||
|
||||
Dataset<I> results = readPath(spark, inputPath, inputClazz);
|
||||
|
||||
results.createOrReplaceTempView("result");
|
||||
|
||||
Dataset<PreparedResult> preparedResult = spark
|
||||
.sql(
|
||||
"select pIde.value value, id " +
|
||||
"from result " +
|
||||
"lateral view explode (pid) p as pIde " +
|
||||
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
|
||||
.as(Encoders.bean(PreparedResult.class));
|
||||
|
||||
bipScores
|
||||
.joinWith(
|
||||
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
|
||||
"inner")
|
||||
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
|
||||
BipScore ret = value._1();
|
||||
ret.setId(value._2().getId());
|
||||
return ret;
|
||||
}, Encoders.bean(BipScore.class))
|
||||
.groupByKey((MapFunction<BipScore, String>) BipScore::getId, Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
|
||||
Result ret = new Result();
|
||||
ret.setDataInfo(getDataInfo());
|
||||
BipScore first = it.next();
|
||||
ret.setId(first.getId());
|
||||
|
||||
ret.setMeasures(getMeasure(first));
|
||||
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
|
||||
.map((MapFunction<BipScore, Result>) bs -> {
|
||||
Result ret = new Result();
|
||||
|
||||
ret.setId(bs.getId());
|
||||
|
||||
ret.setMeasures(getMeasure(bs));
|
||||
|
||||
return ret;
|
||||
}, Encoders.bean(Result.class))
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(inputClazz, p))
|
||||
.map(p -> new AtomicAction(Result.class, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
|
@ -159,7 +132,21 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
KeyValue kv = new KeyValue();
|
||||
kv.setValue(unit.getValue());
|
||||
kv.setKey(unit.getKey());
|
||||
kv.setDataInfo(getDataInfo());
|
||||
kv
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
UPDATE_MEASURE_BIP_CLASS_ID,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
return kv;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
@ -168,21 +155,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static DataInfo getDataInfo() {
|
||||
DataInfo di = new DataInfo();
|
||||
di.setInferred(false);
|
||||
di.setInvisible(false);
|
||||
di.setDeletedbyinference(false);
|
||||
di.setTrust("");
|
||||
Qualifier qualifier = new Qualifier();
|
||||
qualifier.setClassid("sysimport:actionset");
|
||||
qualifier.setClassname("Harvested");
|
||||
qualifier.setSchemename("dnet:provenanceActions");
|
||||
qualifier.setSchemeid("dnet:provenanceActions");
|
||||
di.setProvenanceaction(qualifier);
|
||||
return di;
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
|
@ -0,0 +1,91 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class GetFOSSparkJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GetFOSSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
GetFOSSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
// the path where the original fos csv file is stored
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath {}", sourcePath);
|
||||
|
||||
// the path where to put the file as json
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
sconf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
getFOS(
|
||||
spark,
|
||||
sourcePath,
|
||||
outputPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void getFOS(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
|
||||
Dataset<Row> fosData = spark
|
||||
.read()
|
||||
.format("csv")
|
||||
.option("sep", delimiter)
|
||||
.option("inferSchema", "true")
|
||||
.option("header", "true")
|
||||
.option("quotes", "\"")
|
||||
.load(sourcePath);
|
||||
|
||||
fosData.map((MapFunction<Row, FOSDataModel>) r -> {
|
||||
FOSDataModel fosDataModel = new FOSDataModel();
|
||||
fosDataModel.setDoi(r.getString(0).toLowerCase());
|
||||
fosDataModel.setLevel1(r.getString(1));
|
||||
fosDataModel.setLevel2(r.getString(2));
|
||||
fosDataModel.setLevel3(r.getString(3));
|
||||
return fosDataModel;
|
||||
}, Encoders.bean(FOSDataModel.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class GetSDGSparkJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GetSDGSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
GetSDGSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
// the path where the original fos csv file is stored
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath {}", sourcePath);
|
||||
|
||||
// the path where to put the file as json
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
sconf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
getSDG(
|
||||
spark,
|
||||
sourcePath,
|
||||
outputPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void getSDG(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
|
||||
Dataset<Row> sdgData = spark
|
||||
.read()
|
||||
.format("csv")
|
||||
.option("sep", delimiter)
|
||||
.option("inferSchema", "true")
|
||||
.option("header", "true")
|
||||
.option("quotes", "\"")
|
||||
.load(sourcePath);
|
||||
|
||||
sdgData.map((MapFunction<Row, SDGDataModel>) r -> {
|
||||
SDGDataModel sdgDataModel = new SDGDataModel();
|
||||
sdgDataModel.setDoi(r.getString(0).toLowerCase());
|
||||
sdgDataModel.setSbj(r.getString(1));
|
||||
|
||||
return sdgDataModel;
|
||||
}, Encoders.bean(SDGDataModel.class))
|
||||
.filter((FilterFunction<SDGDataModel>) sdg -> sdg.getSbj() != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,178 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.UPDATE_CLASS_NAME;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Measure;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class PrepareBipFinder implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareBipFinder.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath {}: ", sourcePath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
||||
prepareResults(spark, sourcePath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void prepareResults(SparkSession spark, String inputPath, String outputPath) {
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
|
||||
.textFile(inputPath)
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
|
||||
|
||||
spark
|
||||
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
|
||||
BipScore bs = new BipScore();
|
||||
bs.setId(key);
|
||||
bs.setScoreList(entry.get(key));
|
||||
|
||||
return bs;
|
||||
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
|
||||
.map((MapFunction<BipScore, Result>) v -> {
|
||||
Result r = new Result();
|
||||
final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId());
|
||||
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
|
||||
Instance inst = new Instance();
|
||||
inst.setMeasures(getMeasure(v));
|
||||
|
||||
inst
|
||||
.setPid(
|
||||
Arrays
|
||||
.asList(
|
||||
OafMapperUtils
|
||||
.structuredProperty(
|
||||
cleanedPid,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
DOI, DOI_CLASSNAME,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES),
|
||||
null)));
|
||||
r.setInstance(Arrays.asList(inst));
|
||||
r
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.PROVENANCE_ENRICH,
|
||||
null,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
null));
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/bip");
|
||||
}
|
||||
|
||||
private static List<Measure> getMeasure(BipScore value) {
|
||||
return value
|
||||
.getScoreList()
|
||||
.stream()
|
||||
.map(score -> {
|
||||
Measure m = new Measure();
|
||||
m.setId(score.getId());
|
||||
m
|
||||
.setUnit(
|
||||
score
|
||||
.getUnit()
|
||||
.stream()
|
||||
.map(unit -> {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setValue(unit.getValue());
|
||||
kv.setKey(unit.getKey());
|
||||
kv
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
UPDATE_MEASURE_BIP_CLASS_ID,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
return kv;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
return m;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,99 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class PrepareFOSSparkJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareFOSSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath: {}", sourcePath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
distributeFOSdois(
|
||||
spark,
|
||||
sourcePath,
|
||||
|
||||
outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) {
|
||||
Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class);
|
||||
|
||||
fosDataset
|
||||
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getDoi().toLowerCase(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
|
||||
Result r = new Result();
|
||||
FOSDataModel first = it.next();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
|
||||
HashSet<String> level1 = new HashSet<>();
|
||||
HashSet<String> level2 = new HashSet<>();
|
||||
HashSet<String> level3 = new HashSet<>();
|
||||
addLevels(level1, level2, level3, first);
|
||||
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
r.setSubject(sbjs);
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/fos");
|
||||
}
|
||||
|
||||
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
|
||||
FOSDataModel first) {
|
||||
level1.add(first.getLevel1());
|
||||
level2.add(first.getLevel2());
|
||||
level3.add(first.getLevel3());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class PrepareSDGSparkJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareSDGSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareSDGSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath: {}", sourcePath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
doPrepare(
|
||||
spark,
|
||||
sourcePath,
|
||||
|
||||
outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) {
|
||||
Dataset<SDGDataModel> sdgDataset = readPath(spark, sourcePath, SDGDataModel.class);
|
||||
|
||||
sdgDataset
|
||||
.groupByKey((MapFunction<SDGDataModel, String>) r -> r.getDoi().toLowerCase(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, SDGDataModel, Result>) (k, it) -> {
|
||||
Result r = new Result();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
|
||||
SDGDataModel first = it.next();
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
|
||||
it
|
||||
.forEachRemaining(
|
||||
s -> sbjs
|
||||
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
|
||||
r.setSubject(sbjs);
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/sdg");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class SparkSaveUnresolved implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSaveUnresolved.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareFOSSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath: {}", sourcePath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
saveUnresolved(
|
||||
spark,
|
||||
sourcePath,
|
||||
|
||||
outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) {
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(sourcePath + "/*")
|
||||
.map(
|
||||
(MapFunction<String, Result>) l -> OBJECT_MAPPER.readValue(l, Result.class),
|
||||
Encoders.bean(Result.class))
|
||||
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
|
||||
Result ret = it.next();
|
||||
it.forEachRemaining(r -> ret.mergeFrom(r));
|
||||
return ret;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.opencsv.bean.CsvBindByPosition;
|
||||
|
||||
public class FOSDataModel implements Serializable {
|
||||
@CsvBindByPosition(position = 0)
|
||||
// @CsvBindByName(column = "doi")
|
||||
private String doi;
|
||||
|
||||
@CsvBindByPosition(position = 1)
|
||||
// @CsvBindByName(column = "level1")
|
||||
private String level1;
|
||||
|
||||
@CsvBindByPosition(position = 2)
|
||||
// @CsvBindByName(column = "level2")
|
||||
private String level2;
|
||||
|
||||
@CsvBindByPosition(position = 3)
|
||||
// @CsvBindByName(column = "level3")
|
||||
private String level3;
|
||||
|
||||
public FOSDataModel() {
|
||||
|
||||
}
|
||||
|
||||
public FOSDataModel(String doi, String level1, String level2, String level3) {
|
||||
this.doi = doi;
|
||||
this.level1 = level1;
|
||||
this.level2 = level2;
|
||||
this.level3 = level3;
|
||||
}
|
||||
|
||||
public static FOSDataModel newInstance(String d, String level1, String level2, String level3) {
|
||||
return new FOSDataModel(d, level1, level2, level3);
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getLevel1() {
|
||||
return level1;
|
||||
}
|
||||
|
||||
public void setLevel1(String level1) {
|
||||
this.level1 = level1;
|
||||
}
|
||||
|
||||
public String getLevel2() {
|
||||
return level2;
|
||||
}
|
||||
|
||||
public void setLevel2(String level2) {
|
||||
this.level2 = level2;
|
||||
}
|
||||
|
||||
public String getLevel3() {
|
||||
return level3;
|
||||
}
|
||||
|
||||
public void setLevel3(String level3) {
|
||||
this.level3 = level3;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.opencsv.bean.CsvBindByPosition;
|
||||
|
||||
public class SDGDataModel implements Serializable {
|
||||
|
||||
@CsvBindByPosition(position = 0)
|
||||
// @CsvBindByName(column = "doi")
|
||||
private String doi;
|
||||
|
||||
@CsvBindByPosition(position = 1)
|
||||
// @CsvBindByName(column = "sdg")
|
||||
private String sbj;
|
||||
|
||||
public SDGDataModel() {
|
||||
|
||||
}
|
||||
|
||||
public SDGDataModel(String doi, String sbj) {
|
||||
this.doi = doi;
|
||||
this.sbj = sbj;
|
||||
|
||||
}
|
||||
|
||||
public static SDGDataModel newInstance(String d, String sbj) {
|
||||
return new SDGDataModel(d, sbj);
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getSbj() {
|
||||
return sbj;
|
||||
}
|
||||
|
||||
public void setSbj(String sbj) {
|
||||
this.sbj = sbj;
|
||||
}
|
||||
}
|
|
@ -1,607 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.nio.charset.CodingErrorAction
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.LocalDate
|
||||
import java.time.chrono.ThaiBuddhistDate
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.regex.Pattern
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
import scala.language.postfixOps
|
||||
|
||||
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
|
||||
|
||||
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
|
||||
|
||||
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
|
||||
|
||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||
|
||||
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
|
||||
|
||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||
|
||||
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
|
||||
|
||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||
|
||||
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
val REL_TYPE_VALUE:String = "resultResult"
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
|
||||
val subRelTypeMapping: Map[String,(String,String)] = Map(
|
||||
"References" ->("IsReferencedBy","relationship"),
|
||||
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
|
||||
"IsPartOf" ->("HasPart","part"),
|
||||
"HasPart" ->("IsPartOf","part"),
|
||||
"IsVersionOf" ->("HasVersion","version"),
|
||||
"HasVersion" ->("IsVersionOf","version"),
|
||||
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
|
||||
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
|
||||
"IsContinuedBy" ->("Continues","relationship"),
|
||||
"Continues" ->("IsContinuedBy","relationship"),
|
||||
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
|
||||
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
|
||||
"IsDocumentedBy" ->("Documents","relationship"),
|
||||
"IsSourceOf" ->("IsDerivedFrom","relationship"),
|
||||
"Cites" ->("IsCitedBy","citation"),
|
||||
"IsCitedBy" ->("Cites","citation"),
|
||||
"IsDerivedFrom" ->("IsSourceOf","relationship"),
|
||||
"IsVariantFormOf" ->("IsDerivedFrom","version"),
|
||||
"IsReferencedBy" ->("References","relationship"),
|
||||
"IsObsoletedBy" ->("IsNewVersionOf","version"),
|
||||
"Reviews" ->("IsReviewedBy","review"),
|
||||
"Documents" ->("IsDocumentedBy","relationship"),
|
||||
"IsCompiledBy" ->("Compiles","relationship"),
|
||||
"Compiles" ->("IsCompiledBy","relationship"),
|
||||
"IsReviewedBy" ->("Reviews","review")
|
||||
)
|
||||
|
||||
implicit val codec: Codec = Codec("UTF-8")
|
||||
codec.onMalformedInput(CodingErrorAction.REPLACE)
|
||||
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
|
||||
|
||||
val DOI_CLASS = "doi"
|
||||
val SUBJ_CLASS = "keywords"
|
||||
|
||||
|
||||
val j_filter: List[String] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
|
||||
s.lines.toList
|
||||
}
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
|
||||
|
||||
val dataInfo: DataInfo = generateDataInfo("0.9")
|
||||
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
|
||||
|
||||
val hostedByMap: Map[String, HostedByMapType] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(s)
|
||||
json.extract[Map[String, HostedByMapType]]
|
||||
}
|
||||
|
||||
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
|
||||
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
|
||||
val funder_regex: List[(Pattern, String)] = List(
|
||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
|
||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
|
||||
|
||||
)
|
||||
|
||||
val Date_regex: List[Pattern] = List(
|
||||
//Y-M-D
|
||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||
//M-D-Y
|
||||
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
||||
//D-M-Y
|
||||
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
||||
//Y
|
||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||
)
|
||||
|
||||
|
||||
def filter_json(json: String): Boolean = {
|
||||
j_filter.exists(f => json.contains(f))
|
||||
}
|
||||
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
def embargo_end(embargo_end_date: String): Boolean = {
|
||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||
val td = LocalDate.now()
|
||||
td.isAfter(dt)
|
||||
}
|
||||
|
||||
|
||||
def extract_date(input: String): Option[String] = {
|
||||
val d = Date_regex.map(pattern => {
|
||||
val matcher = pattern.matcher(input)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
}
|
||||
).find(s => s != null)
|
||||
|
||||
if (d.isDefined) {
|
||||
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||
} catch {
|
||||
case _: Throwable => try {
|
||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
return None
|
||||
}
|
||||
}
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
def fix_thai_date(input:String, format:String) :String = {
|
||||
try {
|
||||
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
|
||||
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
|
||||
LocalDate.from(d).toString
|
||||
} catch {
|
||||
case _: Throwable => ""
|
||||
}
|
||||
}
|
||||
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
|
||||
if (resourceType != null && resourceType.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
|
||||
if (typeQualifier != null)
|
||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
||||
}
|
||||
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
|
||||
if (typeQualifier != null)
|
||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
||||
|
||||
}
|
||||
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
|
||||
if (typeQualifier != null)
|
||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
|
||||
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
|
||||
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (typeQualifiers == null)
|
||||
return null
|
||||
val i = new Instance
|
||||
i.setInstancetype(typeQualifiers._1)
|
||||
typeQualifiers._2.getClassname match {
|
||||
case "dataset" =>
|
||||
val r = new OafDataset
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "publication" =>
|
||||
val r = new Publication
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "software" =>
|
||||
val r = new Software
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "other" =>
|
||||
val r = new OtherResearchProduct
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
|
||||
def available_date(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
val l: List[String] = for {
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JField("dateType", JString(dateTypes)) <- dates
|
||||
} yield dateTypes
|
||||
|
||||
l.exists(p => p.equalsIgnoreCase("available"))
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* As describe in ticket #6377
|
||||
* when the result come from figshare we need to remove subject
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
def fix_figshare(r: Result): Unit = {
|
||||
|
||||
if (r.getInstance() != null) {
|
||||
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||
}
|
||||
|
||||
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||
OafMapperUtils.structuredProperty(dt, q, null)
|
||||
}
|
||||
|
||||
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
r.setTarget(targetId)
|
||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
||||
r.setRelClass(relClass)
|
||||
r.setSubRelType(ModelConstants.OUTCOME)
|
||||
r.setCollectedfrom(List(cf).asJava)
|
||||
r.setDataInfo(di)
|
||||
r
|
||||
|
||||
|
||||
}
|
||||
|
||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
|
||||
|
||||
if (match_pattern.isDefined) {
|
||||
val m = match_pattern.get._1
|
||||
val p = match_pattern.get._2
|
||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||
List(
|
||||
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
|
||||
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
|
||||
)
|
||||
}
|
||||
else
|
||||
List()
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
|
||||
if (filter_json(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||
|
||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||
if (doi.isEmpty)
|
||||
return List()
|
||||
|
||||
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
|
||||
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (result == null)
|
||||
return List()
|
||||
|
||||
|
||||
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
result.setOriginalId(List(doi).asJava)
|
||||
|
||||
val d = new Date(dateOfCollection * 1000)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
|
||||
|
||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||
result.setDataInfo(dataInfo)
|
||||
|
||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||
|
||||
|
||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||
val a = new Author
|
||||
a.setFullname(c.name.orNull)
|
||||
a.setName(c.givenName.orNull)
|
||||
a.setSurname(c.familyName.orNull)
|
||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||
a.setPid(c.nameIdentifiers.get.map(ni => {
|
||||
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
}
|
||||
else
|
||||
null
|
||||
|
||||
}
|
||||
)
|
||||
.asJava)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
|
||||
a.setRank(idx + 1)
|
||||
a
|
||||
}
|
||||
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
|
||||
}
|
||||
}).asJava)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val dates = (json \\ "dates").extract[List[DateType]]
|
||||
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||
|
||||
val i_date = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
val a_date: Option[String] = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
.find(d => d != null && d.isDefined)
|
||||
.map(d => d.get)
|
||||
|
||||
if (a_date.isDefined) {
|
||||
if(doi.startsWith("10.14457"))
|
||||
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
|
||||
else
|
||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||
}
|
||||
if (i_date.isDefined && i_date.get.isDefined) {
|
||||
if(doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
|
||||
}
|
||||
else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
}
|
||||
}
|
||||
else if (publication_year != null) {
|
||||
if(doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
|
||||
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2)).asJava)
|
||||
|
||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||
|
||||
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
||||
).asJava)
|
||||
|
||||
|
||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
|
||||
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||
|
||||
result.setDescription(
|
||||
descriptions
|
||||
.filter(d => d.description.isDefined).
|
||||
map(d =>
|
||||
OafMapperUtils.field(d.description.get, null)
|
||||
).filter(s => s != null).asJava)
|
||||
|
||||
|
||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||
if (publisher != null)
|
||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||
|
||||
|
||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||
|
||||
if (language != null)
|
||||
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
|
||||
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
}).find(q => q != null).map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
|
||||
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
|
||||
if (client.isDefined) {
|
||||
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
|
||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
|
||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
}
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
|
||||
fix_figshare(result)
|
||||
|
||||
if (result.getId == null)
|
||||
return List()
|
||||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
}
|
||||
else
|
||||
List(result)
|
||||
}
|
||||
|
||||
private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = {
|
||||
rels
|
||||
.filter(r =>
|
||||
subRelTypeMapping.contains(r.relationType) && (
|
||||
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
)
|
||||
.map(r => {
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.setDataInfo(dataInfo)
|
||||
|
||||
val subRelType = subRelTypeMapping(r.relationType)._2
|
||||
rel.setRelType(REL_TYPE_VALUE)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(r.relationType)
|
||||
|
||||
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.setSource(id)
|
||||
rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}")
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut)
|
||||
rel
|
||||
})(collection breakOut)
|
||||
}
|
||||
|
||||
def generateDataInfo(trust: String): DataInfo = {
|
||||
val di = new DataInfo
|
||||
di.setDeletedbyinference(false)
|
||||
di.setInferred(false)
|
||||
di.setInvisible(false)
|
||||
di.setTrust(trust)
|
||||
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
|
||||
di
|
||||
}
|
||||
|
||||
def generateDSId(input: String): String = {
|
||||
val b = StringUtils.substringBefore(input, "::")
|
||||
val a = StringUtils.substringAfter(input, "::")
|
||||
s"10|$b::${DHPUtils.md5(a)}"
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,41 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object ExportActionSetJobNode {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
.appName(ExportActionSetJobNode.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
|
||||
|
||||
spark.read.load(sourcePath).as[Oaf]
|
||||
.map(o =>DataciteToOAFTransformation.toActionSet(o))
|
||||
.filter(o => o!= null)
|
||||
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,46 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object FilterCrossrefEntitiesSpark {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info("sourcePath: {}", sourcePath)
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info("targetPath: {}", targetPath)
|
||||
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
||||
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
|
||||
|
||||
val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf]
|
||||
|
||||
d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object GenerateDataciteDatasetSpark {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
||||
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
spark.read.load(sourcePath).as[DataciteType]
|
||||
.filter(d => d.isActive)
|
||||
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
|
||||
.filter(d => d != null)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,181 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.opencitations;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class CreateActionSetSparkJob implements Serializable {
|
||||
public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
|
||||
public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
|
||||
private static final String ID_PREFIX = "50|doi_________::";
|
||||
private static final String TRUST = "0.91";
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws IOException, ParseException {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
CreateActionSetSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath {}", inputPath.toString());
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final boolean shouldDuplicateRels = Optional
|
||||
.ofNullable(parser.get("shouldDuplicateRels"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.FALSE);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
extractContent(spark, inputPath, outputPath, shouldDuplicateRels);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
|
||||
boolean shouldDuplicateRels) {
|
||||
spark
|
||||
.sqlContext()
|
||||
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
|
||||
.flatMap(
|
||||
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter((FilterFunction<Relation>) value -> value != null)
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
|
||||
}
|
||||
|
||||
private static List<Relation> createRelation(String value, boolean duplicate) {
|
||||
String[] line = value.split(",");
|
||||
if (!line[1].startsWith("10.")) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
List<Relation> relationList = new ArrayList<>();
|
||||
|
||||
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
|
||||
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
|
||||
|
||||
relationList
|
||||
.addAll(
|
||||
getRelations(
|
||||
citing,
|
||||
cited));
|
||||
|
||||
if (duplicate && line[1].endsWith(".refs")) {
|
||||
citing = ID_PREFIX + IdentifierFactory
|
||||
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
|
||||
relationList.addAll(getRelations(citing, cited));
|
||||
}
|
||||
|
||||
return relationList;
|
||||
}
|
||||
|
||||
private static Collection<Relation> getRelations(String citing, String cited) {
|
||||
|
||||
return Arrays
|
||||
.asList(
|
||||
getRelation(citing, cited, ModelConstants.CITES),
|
||||
getRelation(cited, citing, ModelConstants.IS_CITED_BY));
|
||||
}
|
||||
|
||||
public static Relation getRelation(
|
||||
String source,
|
||||
String target,
|
||||
String relclass) {
|
||||
Relation r = new Relation();
|
||||
r.setCollectedfrom(getCollectedFrom());
|
||||
r.setSource(source);
|
||||
r.setTarget(target);
|
||||
r.setRelClass(relclass);
|
||||
r.setRelType(ModelConstants.RESULT_RESULT);
|
||||
r.setSubRelType(ModelConstants.CITATION);
|
||||
r
|
||||
.setDataInfo(
|
||||
getDataInfo());
|
||||
return r;
|
||||
}
|
||||
|
||||
public static List<KeyValue> getCollectedFrom() {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey(ModelConstants.OPENOCITATIONS_ID);
|
||||
kv.setValue(ModelConstants.OPENOCITATIONS_NAME);
|
||||
|
||||
return Arrays.asList(kv);
|
||||
}
|
||||
|
||||
public static DataInfo getDataInfo() {
|
||||
DataInfo di = new DataInfo();
|
||||
di.setInferred(false);
|
||||
di.setDeletedbyinference(false);
|
||||
di.setTrust(TRUST);
|
||||
|
||||
di
|
||||
.setProvenanceaction(
|
||||
getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
return di;
|
||||
}
|
||||
|
||||
public static Qualifier getQualifier(String class_id, String class_name,
|
||||
String qualifierSchema) {
|
||||
Qualifier pa = new Qualifier();
|
||||
pa.setClassid(class_id);
|
||||
pa.setClassname(class_name);
|
||||
pa.setSchemeid(qualifierSchema);
|
||||
pa.setSchemename(qualifierSchema);
|
||||
return pa;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.opencitations;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class GetOpenCitationsRefs implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class);
|
||||
|
||||
public static void main(final String[] args) throws IOException, ParseException {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
GetOpenCitationsRefs.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String[] inputFile = parser.get("inputFile").split(";");
|
||||
log.info("inputFile {}", inputFile.toString());
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath {}", workingPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
log.info("hdfsNameNode {}", hdfsNameNode);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
|
||||
|
||||
for (String file : inputFile) {
|
||||
ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
|
||||
throws IOException {
|
||||
|
||||
final Path path = new Path(inputFile);
|
||||
|
||||
FSDataInputStream oc_zip = fileSystem.open(path);
|
||||
|
||||
int count = 1;
|
||||
try (ZipInputStream zis = new ZipInputStream(oc_zip)) {
|
||||
ZipEntry entry = null;
|
||||
while ((entry = zis.getNextEntry()) != null) {
|
||||
|
||||
if (!entry.isDirectory()) {
|
||||
String fileName = entry.getName();
|
||||
fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
|
||||
count++;
|
||||
try (
|
||||
FSDataOutputStream out = fileSystem
|
||||
.create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
|
||||
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
|
||||
|
||||
IOUtils.copy(zis, gzipOs);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,73 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.scholix
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkCreateActionset {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
val workingDirFolder = parser.get("workingDirFolder")
|
||||
log.info(s"workingDirFolder -> $workingDirFolder")
|
||||
|
||||
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
|
||||
implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
|
||||
|
||||
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
.flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
|
||||
|
||||
|
||||
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
|
||||
|
||||
log.info("extract source and target Identifier involved in relations")
|
||||
|
||||
|
||||
log.info("save relation filtered")
|
||||
|
||||
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
|
||||
|
||||
log.info("saving entities")
|
||||
|
||||
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
|
||||
|
||||
|
||||
entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
|
||||
entities
|
||||
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
|
||||
.map(p => p._1._2)
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.scholix
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkSaveActionSet {
|
||||
|
||||
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
|
||||
spark.read.load(sourcePath).as[Oaf]
|
||||
.map(o => toActionSet(o))
|
||||
.filter(o => o != null)
|
||||
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,253 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* This class represent an instance of Pubmed Article extracted from the native XML
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
|
||||
public class PMArticle implements Serializable {
|
||||
|
||||
/**
|
||||
* the Pubmed Identifier
|
||||
*/
|
||||
private String pmid;
|
||||
/**
|
||||
* the DOI
|
||||
*/
|
||||
private String doi;
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*/
|
||||
private String date;
|
||||
/**
|
||||
* This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
|
||||
*/
|
||||
private PMJournal journal;
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*/
|
||||
private String title;
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*/
|
||||
private String description;
|
||||
/**
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. *
|
||||
*/
|
||||
private final List<PMSubject> subjects = new ArrayList<>();
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*/
|
||||
private final List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*/
|
||||
private List<PMAuthor> authors = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
*/
|
||||
private final List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* get the DOI
|
||||
* @return a DOI
|
||||
*/
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the DOI
|
||||
* @param doi a DOI
|
||||
*/
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the Pubmed Identifier
|
||||
* @return the PMID
|
||||
*/
|
||||
public String getPmid() {
|
||||
return pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the Pubmed Identifier
|
||||
* @param pmid the Pubmed Identifier
|
||||
*/
|
||||
public void setPmid(String pmid) {
|
||||
this.pmid = pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*
|
||||
* @return the Pubmed Date
|
||||
*/
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the pubmed Date
|
||||
* @param date
|
||||
*/
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
|
||||
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
|
||||
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*
|
||||
* @return the pubmed Journal Extracted
|
||||
*/
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the mapped pubmed Journal
|
||||
* @param journal
|
||||
*/
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*
|
||||
* @return the extracted pubmed Title
|
||||
*/
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the pubmed title
|
||||
* @param title
|
||||
*/
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*
|
||||
* @return the Mapped Pubmed Article Abstracts
|
||||
*/
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Pubmed Article Abstracts
|
||||
* @param description
|
||||
*/
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*
|
||||
* @return get the Mapped Authors lists
|
||||
*/
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Authors lists
|
||||
* @param authors
|
||||
*/
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Subjects
|
||||
*/
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*
|
||||
* @return The mapped Language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Set The mapped Language
|
||||
*
|
||||
* @param language the mapped Language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Publication Type
|
||||
*/
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
* @return the mapped grants
|
||||
*/
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
}
|
|
@ -1,29 +1,59 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The type Pubmed author.
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
public class PMAuthor implements Serializable {
|
||||
|
||||
private String lastName;
|
||||
private String foreName;
|
||||
|
||||
/**
|
||||
* Gets last name.
|
||||
*
|
||||
* @return the last name
|
||||
*/
|
||||
public String getLastName() {
|
||||
return lastName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets last name.
|
||||
*
|
||||
* @param lastName the last name
|
||||
*/
|
||||
public void setLastName(String lastName) {
|
||||
this.lastName = lastName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets fore name.
|
||||
*
|
||||
* @return the fore name
|
||||
*/
|
||||
public String getForeName() {
|
||||
return foreName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets fore name.
|
||||
*
|
||||
* @param foreName the fore name
|
||||
*/
|
||||
public void setForeName(String foreName) {
|
||||
this.foreName = foreName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets full name.
|
||||
*
|
||||
* @return the full name
|
||||
*/
|
||||
public String getFullName() {
|
||||
return String
|
||||
.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");
|
|
@ -0,0 +1,87 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
/**
|
||||
* The type Pm grant.
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
public class PMGrant {
|
||||
|
||||
private String grantID;
|
||||
private String agency;
|
||||
private String country;
|
||||
|
||||
/**
|
||||
* Instantiates a new Pm grant.
|
||||
*/
|
||||
public PMGrant() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new Pm grant.
|
||||
*
|
||||
* @param grantID the grant id
|
||||
* @param agency the agency
|
||||
* @param country the country
|
||||
*/
|
||||
public PMGrant(String grantID, String agency, String country) {
|
||||
this.grantID = grantID;
|
||||
this.agency = agency;
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets grant id.
|
||||
*
|
||||
* @return the grant id
|
||||
*/
|
||||
public String getGrantID() {
|
||||
return grantID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets grant id.
|
||||
*
|
||||
* @param grantID the grant id
|
||||
*/
|
||||
public void setGrantID(String grantID) {
|
||||
this.grantID = grantID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets agency.
|
||||
*
|
||||
* @return the agency
|
||||
*/
|
||||
public String getAgency() {
|
||||
return agency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets agency.
|
||||
*
|
||||
* @param agency the agency
|
||||
*/
|
||||
public void setAgency(String agency) {
|
||||
this.agency = agency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets country.
|
||||
*
|
||||
* @return the country
|
||||
*/
|
||||
public String getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets country.
|
||||
*
|
||||
* @param country the country
|
||||
*/
|
||||
public void setCountry(String country) {
|
||||
this.country = country;
|
||||
}
|
||||
}
|
|
@ -1,8 +1,13 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The type Pm journal.
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
public class PMJournal implements Serializable {
|
||||
|
||||
private String issn;
|
||||
|
@ -11,42 +16,92 @@ public class PMJournal implements Serializable {
|
|||
private String date;
|
||||
private String title;
|
||||
|
||||
/**
|
||||
* Gets issn.
|
||||
*
|
||||
* @return the issn
|
||||
*/
|
||||
public String getIssn() {
|
||||
return issn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets issn.
|
||||
*
|
||||
* @param issn the issn
|
||||
*/
|
||||
public void setIssn(String issn) {
|
||||
this.issn = issn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets volume.
|
||||
*
|
||||
* @return the volume
|
||||
*/
|
||||
public String getVolume() {
|
||||
return volume;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets volume.
|
||||
*
|
||||
* @param volume the volume
|
||||
*/
|
||||
public void setVolume(String volume) {
|
||||
this.volume = volume;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets issue.
|
||||
*
|
||||
* @return the issue
|
||||
*/
|
||||
public String getIssue() {
|
||||
return issue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets issue.
|
||||
*
|
||||
* @param issue the issue
|
||||
*/
|
||||
public void setIssue(String issue) {
|
||||
this.issue = issue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets date.
|
||||
*
|
||||
* @return the date
|
||||
*/
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets date.
|
||||
*
|
||||
* @param date the date
|
||||
*/
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets title.
|
||||
*
|
||||
* @return the title
|
||||
*/
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets title.
|
||||
*
|
||||
* @param title the title
|
||||
*/
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
|
@ -1,40 +1,83 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
/**
|
||||
* The type Pubmed subject.
|
||||
*/
|
||||
public class PMSubject {
|
||||
private String value;
|
||||
private String meshId;
|
||||
private String registryNumber;
|
||||
|
||||
/**
|
||||
* Instantiates a new Pm subject.
|
||||
*/
|
||||
public PMSubject() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new Pm subject.
|
||||
*
|
||||
* @param value the value
|
||||
* @param meshId the mesh id
|
||||
* @param registryNumber the registry number
|
||||
*/
|
||||
public PMSubject(String value, String meshId, String registryNumber) {
|
||||
this.value = value;
|
||||
this.meshId = meshId;
|
||||
this.registryNumber = registryNumber;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets value.
|
||||
*
|
||||
* @return the value
|
||||
*/
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets value.
|
||||
*
|
||||
* @param value the value
|
||||
*/
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets mesh id.
|
||||
*
|
||||
* @return the mesh id
|
||||
*/
|
||||
public String getMeshId() {
|
||||
return meshId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets mesh id.
|
||||
*
|
||||
* @param meshId the mesh id
|
||||
*/
|
||||
public void setMeshId(String meshId) {
|
||||
this.meshId = meshId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets registry number.
|
||||
*
|
||||
* @return the registry number
|
||||
*/
|
||||
public String getRegistryNumber() {
|
||||
return registryNumber;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets registry number.
|
||||
*
|
||||
* @param registryNumber the registry number
|
||||
*/
|
||||
public void setRegistryNumber(String registryNumber) {
|
||||
this.registryNumber = registryNumber;
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,9 +1,5 @@
|
|||
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<description>the input path of the resources to be extended</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>bipScorePath</name>
|
||||
|
@ -13,8 +9,61 @@
|
|||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
<start to="deleteoutputpath"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
@ -30,14 +79,8 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="atomicactions">
|
||||
<path start="atomicactions_publication"/>
|
||||
<path start="atomicactions_dataset"/>
|
||||
<path start="atomicactions_orp"/>
|
||||
<path start="atomicactions_software"/>
|
||||
</fork>
|
||||
|
||||
<action name="atomicactions_publication">
|
||||
<action name="atomicactions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
|
@ -54,113 +97,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for datasets</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for orp</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for software</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_aa" to="collectandsave"/>
|
||||
|
||||
<action name="collectandsave">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>saves all the aa produced for the several types of results in the as output path</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}</arg>
|
||||
<arg>--inputPath</arg><arg>${bipScorePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
[
|
||||
{
|
||||
"paramName":"s",
|
||||
"paramLongName":"sourcePath",
|
||||
"paramDescription": "the path of the sequencial file to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName":"out",
|
||||
"paramLongName":"outputPath",
|
||||
"paramDescription": "the output path",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "ssm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "delimiter",
|
||||
"paramDescription": "the delimiter if different from the default one (,)",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,237 @@
|
|||
|
||||
<workflow-app name="UnresolvedEntities" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>fosPath</name>
|
||||
<description>the input path of the resources to be extended</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>bipScorePath</name>
|
||||
<description>the path where to find the bipFinder scores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
<start to="prepareInfo"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<fork name="prepareInfo">
|
||||
<path start="prepareBip"/>
|
||||
<path start="getFOS"/>
|
||||
<path start="getSDG"/>
|
||||
</fork>
|
||||
|
||||
<action name="prepareBip">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the unresolved from bip finder!</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${bipScorePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
|
||||
</spark>
|
||||
<ok to="join"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="getFOS">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Gets Data from FOS csv file</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${fosPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/input/fos</arg>
|
||||
</spark>
|
||||
<ok to="prepareFos"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="prepareFos">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the unresolved from FOS!</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/input/fos</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="join"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="getSDG">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Gets Data from SDG csv file</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetSDGSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sdgPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/input/sdg</arg>
|
||||
</spark>
|
||||
<ok to="prepareSDG"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="prepareSDG">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the unresolved from FOS!</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/input/sdg</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="join"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<join name="join" to="produceUnresolved"/>
|
||||
|
||||
<action name="produceUnresolved">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Saves the result produced for bip and fos by grouping results with the same id</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.SparkSaveUnresolved</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/prepared</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -6,8 +6,8 @@
|
|||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramName": "sp",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the URL from where to get the programme file",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
@ -16,17 +16,5 @@
|
|||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "rtn",
|
||||
"paramLongName": "resultTableName",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bsp",
|
||||
"paramLongName": "bipScorePath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "sp",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the URL from where to get the programme file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
File diff suppressed because it is too large
Load Diff
|
@ -1,95 +0,0 @@
|
|||
<workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oafTargetPath</name>
|
||||
<description>the target path where the OAF records are stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>blocksize</name>
|
||||
<value>100</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>exportLinks</name>
|
||||
<value>false</value>
|
||||
<description>instructs the transformation phase to produce the links or not</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="TransformDatacite">${wf:conf('resumeFrom') eq 'TransformDatacite'}</case>
|
||||
<default to="ImportDatacite"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ImportDatacite</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
|
||||
<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
<arg>--blocksize</arg><arg>${blocksize}</arg>
|
||||
</spark>
|
||||
<ok to="TransformDatacite"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="TransformDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>TransformJob</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--targetPath</arg><arg>${oafTargetPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>${exportLinks}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,84 +0,0 @@
|
|||
<workflow-app name="Generate_Datacite_and_Crossref_dump_for_Scholexplorer" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>datacitePath</name>
|
||||
<description>the path of Datacite spark dataset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>crossrefPath</name>
|
||||
<description>the path of Crossref spark dataset</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the path of Crossref spark dataset</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ImportDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ImportDatacite</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${datacitePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/datacite_oaf</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>true</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="FilterCrossrefEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="FilterCrossrefEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>FilterCrossrefEntities</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${crossrefPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/crossref_oaf</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,25 @@
|
|||
[
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the zipped opencitations file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "op",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": false
|
||||
}, {
|
||||
"paramName": "sdr",
|
||||
"paramLongName": "shouldDuplicateRels",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName": "if",
|
||||
"paramLongName": "inputFile",
|
||||
"paramDescription": "the zipped opencitations file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "hnn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,58 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,2 @@
|
|||
#!/bin/bash
|
||||
for file in $(echo $1 | tr ";" "\n"); do curl -L $(echo $file | cut -d '@' -f 1 ) | hdfs dfs -put - $2/$(echo $file | cut -d '@' -f 2) ; done;
|
|
@ -0,0 +1,91 @@
|
|||
<workflow-app name="OpenCitations Integration" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
|
||||
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="download">
|
||||
<shell xmlns="uri:oozie:shell-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<exec>download.sh</exec>
|
||||
<argument>${filelist}</argument>
|
||||
<argument>${workingPath}/Original</argument>
|
||||
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
|
||||
<file>download.sh</file>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="extract"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="extract">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputFile</arg><arg>${inputFile}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
</java>
|
||||
<ok to="create_actionset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="create_actionset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the AS for OC</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,8 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"opencitationFile", "paramDescription": "the name of the file", "paramRequired": true},
|
||||
{"paramName":"issm", "paramLongName":"isSparkSessionManaged", "paramDescription": "the name of the activities orcid file", "paramRequired": false},
|
||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the name of the activities orcid file", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -1,46 +1,52 @@
|
|||
<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Collect_Datacite" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path of Datacite ActionSet</description>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>blocksize</name>
|
||||
<value>100</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ExportDataset"/>
|
||||
<start to="ImportDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ExportDataset">
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExportDataset</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
|
||||
<name>ImportDatacite</name>
|
||||
<class>eu.dnetlib.dhp.datacite.ImportDatacite</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
|
||||
<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
<arg>--blocksize</arg><arg>${blocksize}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -7,8 +7,8 @@
|
|||
},
|
||||
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramName": "mo",
|
||||
"paramLongName": "mdstoreOutputVersion",
|
||||
"paramDescription": "the target mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"CERN.ZENODO": {
|
||||
"openaire_id": "opendoar____::2659",
|
||||
"datacite_name": "Zenodo",
|
||||
"official_name": "ZENODO"
|
||||
},
|
||||
"TIB.PANGAEA": {
|
||||
"openaire_id": "re3data_____::r3d100010134",
|
||||
"datacite_name": "PANGAEA",
|
||||
"official_name": "PANGAEA"
|
||||
},
|
||||
"DRYAD.DRYAD": {
|
||||
"openaire_id": "re3data_____::r3d100000044",
|
||||
"datacite_name": "DRYAD",
|
||||
"official_name": "DRYAD"
|
||||
},
|
||||
"FIGSHARE.UCT": {
|
||||
"openaire_id": "re3data_____::r3d100012633",
|
||||
"datacite_name": "University of Cape Town (UCT)",
|
||||
"official_name": "ZivaHub"
|
||||
},
|
||||
"CSIC.DIGITAL": {
|
||||
"openaire_id": "re3data_____::r3d100011076",
|
||||
"datacite_name": "Digital CSIC",
|
||||
"official_name": "DIGITAL.CSIC"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,126 @@
|
|||
<workflow-app name="transform_Datacite" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="TransformJob"/>
|
||||
<error to="EndReadRollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="TransformJob">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>TransformJob</name>
|
||||
<class>eu.dnetlib.dhp.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>true</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="EndReadRollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="RollBack"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,51 @@
|
|||
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>database</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the Target Working dir path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertDB"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ConvertDB">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Bio DB to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.SparkTransformBioDatabaseToOAF</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--database</arg><arg>${database}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -3,5 +3,6 @@
|
|||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
|
||||
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,105 @@
|
|||
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the OAF MDStore Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resumeFrom</name>
|
||||
<value>DownloadEBILinks</value>
|
||||
<description>node to start</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
|
||||
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
||||
<default to="DownloadEBILinks"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DownloadEBILinks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Incremental Download EBI Links</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkDownloadEBILinks</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="OverrideFolders"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="OverrideFolders">
|
||||
<fs>
|
||||
<delete path="${sourcePath}/ebi_links_dataset_old"/>
|
||||
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
|
||||
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
|
||||
</fs>
|
||||
<ok to="CreateEBIDataSet"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="CreateEBIDataSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create OAF DataSet</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkEBILinksToOaf</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,17 +1,22 @@
|
|||
<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>baselineWorkingPath</name>
|
||||
<description>the Baseline Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the Target Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>The target path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>skipUpdate</name>
|
||||
<value>false</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertDataset"/>
|
||||
|
@ -24,9 +29,9 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Baseline to Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkCreateBaselineDataFrame</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<name>Convert Baseline to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@ -41,6 +46,8 @@
|
|||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
<arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
|
@ -16,8 +16,4 @@
|
|||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,62 @@
|
|||
<workflow-app name="Retrieve Scholix Update" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path of scholix graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datacitePath</name>
|
||||
<description>the datacite native path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingSupportPath</name>
|
||||
<description>the working Support path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>updateDS</name>
|
||||
<value>false</value>
|
||||
<description>The transformation Rule to apply</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="RetrieveDeltaDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="RetrieveDeltaDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>New Update from Datacite to Scholix</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkRetrieveDataciteDelta</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=6000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--datacitePath</arg><arg>${datacitePath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--workingSupportPath</arg><arg>${workingSupportPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--updateDS</arg><arg>${updateDS}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,41 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "datacitePath",
|
||||
"paramDescription": "the datacite native path",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingSupportPath",
|
||||
"paramDescription": "the working Support path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the isLookup URL",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "updateDS",
|
||||
"paramDescription": "Need to regenerate all support Dataset",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
|
@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
|
|||
# A1 uses PatternLayout.
|
||||
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
|
||||
log4j.logger.org.apache.spark=FATAL
|
||||
log4j.logger.org.spark_project=FATAL
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.dhp.collection
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
|
||||
|
||||
object CollectionUtils {
|
||||
|
||||
/** This method in pipeline to the transformation phase,
|
||||
* generates relations in both verse, typically it should be a phase of flatMap
|
||||
*
|
||||
* @param i input OAF
|
||||
* @return
|
||||
* If the input OAF is an entity -> List(i)
|
||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||
*/
|
||||
|
||||
def fixRelations(i: Oaf): List[Oaf] = {
|
||||
if (i.isInstanceOf[OafEntity])
|
||||
return List(i)
|
||||
else {
|
||||
val r: Relation = i.asInstanceOf[Relation]
|
||||
val currentRel = ModelSupport.findRelation(r.getRelClass)
|
||||
if (currentRel != null) {
|
||||
|
||||
// Cleaning relation
|
||||
r.setRelType(currentRel.getRelType)
|
||||
r.setSubRelType(currentRel.getSubReltype)
|
||||
r.setRelClass(currentRel.getRelClass)
|
||||
val inverse = new Relation
|
||||
inverse.setSource(r.getTarget)
|
||||
inverse.setTarget(r.getSource)
|
||||
inverse.setRelType(currentRel.getRelType)
|
||||
inverse.setSubRelType(currentRel.getSubReltype)
|
||||
inverse.setRelClass(currentRel.getInverseRelClass)
|
||||
inverse.setCollectedfrom(r.getCollectedfrom)
|
||||
inverse.setDataInfo(r.getDataInfo)
|
||||
inverse.setProperties(r.getProperties)
|
||||
inverse.setLastupdatetimestamp(r.getLastupdatetimestamp)
|
||||
inverse.setValidated(r.getValidated)
|
||||
inverse.setValidationDate(r.getValidationDate)
|
||||
return List(r, inverse)
|
||||
}
|
||||
}
|
||||
List()
|
||||
}
|
||||
|
||||
def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
val mapper = new ObjectMapper
|
||||
|
||||
dataset
|
||||
.flatMap(i => CollectionUtils.fixRelations(i))
|
||||
.filter(i => i != null)
|
||||
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(targetPath)
|
||||
}
|
||||
|
||||
}
|
|
@ -1,13 +1,10 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
|
||||
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
|
||||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}
|
||||
|
||||
import java.io.IOException
|
||||
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
|
||||
abstract class AbstractRestClient extends Iterator[String] {
|
||||
|
||||
|
@ -18,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
|
||||
var complete: Boolean = false
|
||||
|
||||
|
||||
def extractInfo(input: String): Unit
|
||||
|
||||
protected def getBufferData(): Unit
|
||||
|
||||
|
||||
def doHTTPGETRequest(url: String): String = {
|
||||
val httpGet = new HttpGet(url)
|
||||
doHTTPRequest(httpGet)
|
||||
|
@ -45,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
buffer.nonEmpty && current_index < buffer.size
|
||||
}
|
||||
|
||||
|
||||
override def next(): String = {
|
||||
val next_item: String = buffer(current_index)
|
||||
current_index = current_index + 1
|
||||
|
@ -54,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
next_item
|
||||
}
|
||||
|
||||
|
||||
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -71,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -89,4 +83,4 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
}
|
||||
|
||||
getBufferData()
|
||||
}
|
||||
}
|
|
@ -1,9 +1,9 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
|
||||
|
||||
override def extractInfo(input: String): Unit = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -16,16 +16,18 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
|
|||
current_index = 0
|
||||
}
|
||||
|
||||
def get_url():String ={
|
||||
val to = if (until> 0) s"$until" else "*"
|
||||
def get_url(): String = {
|
||||
val to = if (until > 0) s"$until" else "*"
|
||||
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
|
||||
|
||||
}
|
||||
|
||||
override def getBufferData(): Unit = {
|
||||
if (!complete) {
|
||||
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
|
||||
val response =
|
||||
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
|
||||
else doHTTPGETRequest(get_url())
|
||||
extractInfo(response)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,278 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
|
||||
|
||||
import java.io.InputStream
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
import scala.io.Source
|
||||
|
||||
/** This class represent the dataModel of the input Dataset of Datacite
|
||||
* @param doi THE DOI
|
||||
* @param timestamp timestamp of last update date
|
||||
* @param isActive the record is active or deleted
|
||||
* @param json the json native records
|
||||
*/
|
||||
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
/*
|
||||
The following class are utility class used for the mapping from
|
||||
json datacite to OAF Shema
|
||||
*/
|
||||
case class RelatedIdentifierType(
|
||||
relationType: String,
|
||||
relatedIdentifier: String,
|
||||
relatedIdentifierType: String
|
||||
) {}
|
||||
|
||||
case class NameIdentifiersType(
|
||||
nameIdentifierScheme: Option[String],
|
||||
schemeUri: Option[String],
|
||||
nameIdentifier: Option[String]
|
||||
) {}
|
||||
|
||||
case class CreatorType(
|
||||
nameType: Option[String],
|
||||
nameIdentifiers: Option[List[NameIdentifiersType]],
|
||||
name: Option[String],
|
||||
familyName: Option[String],
|
||||
givenName: Option[String],
|
||||
affiliation: Option[List[String]]
|
||||
) {}
|
||||
|
||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||
|
||||
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
|
||||
|
||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||
|
||||
case class FundingReferenceType(
|
||||
funderIdentifierType: Option[String],
|
||||
awardTitle: Option[String],
|
||||
awardUri: Option[String],
|
||||
funderName: Option[String],
|
||||
funderIdentifier: Option[String],
|
||||
awardNumber: Option[String]
|
||||
) {}
|
||||
|
||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||
|
||||
case class OAFRelations(relation: String, inverse: String, relType: String)
|
||||
|
||||
class DataciteModelConstants extends Serializable {}
|
||||
|
||||
object DataciteModelConstants {
|
||||
|
||||
val REL_TYPE_VALUE: String = "resultResult"
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
|
||||
val DOI_CLASS = "doi"
|
||||
val SUBJ_CLASS = "keywords"
|
||||
val DATACITE_NAME = "Datacite"
|
||||
val dataInfo: DataInfo = dataciteDataInfo("0.9")
|
||||
|
||||
val DATACITE_COLLECTED_FROM: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
|
||||
|
||||
val subRelTypeMapping: Map[String, OAFRelations] = Map(
|
||||
ModelConstants.REFERENCES -> OAFRelations(
|
||||
ModelConstants.REFERENCES,
|
||||
ModelConstants.IS_REFERENCED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_REFERENCED_BY -> OAFRelations(
|
||||
ModelConstants.IS_REFERENCED_BY,
|
||||
ModelConstants.REFERENCES,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
|
||||
ModelConstants.IS_SUPPLEMENTED_BY,
|
||||
ModelConstants.IS_SUPPLEMENT_TO,
|
||||
ModelConstants.SUPPLEMENT
|
||||
),
|
||||
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
|
||||
ModelConstants.IS_SUPPLEMENT_TO,
|
||||
ModelConstants.IS_SUPPLEMENTED_BY,
|
||||
ModelConstants.SUPPLEMENT
|
||||
),
|
||||
ModelConstants.HAS_PART -> OAFRelations(
|
||||
ModelConstants.HAS_PART,
|
||||
ModelConstants.IS_PART_OF,
|
||||
ModelConstants.PART
|
||||
),
|
||||
ModelConstants.IS_PART_OF -> OAFRelations(
|
||||
ModelConstants.IS_PART_OF,
|
||||
ModelConstants.HAS_PART,
|
||||
ModelConstants.PART
|
||||
),
|
||||
ModelConstants.IS_VERSION_OF -> OAFRelations(
|
||||
ModelConstants.IS_VERSION_OF,
|
||||
ModelConstants.HAS_VERSION,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.HAS_VERSION -> OAFRelations(
|
||||
ModelConstants.HAS_VERSION,
|
||||
ModelConstants.IS_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
|
||||
ModelConstants.IS_IDENTICAL_TO,
|
||||
ModelConstants.IS_IDENTICAL_TO,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
|
||||
ModelConstants.IS_CONTINUED_BY,
|
||||
ModelConstants.CONTINUES,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.CONTINUES -> OAFRelations(
|
||||
ModelConstants.CONTINUES,
|
||||
ModelConstants.IS_CONTINUED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
|
||||
ModelConstants.IS_NEW_VERSION_OF,
|
||||
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
|
||||
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
||||
ModelConstants.IS_NEW_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.DOCUMENTS -> OAFRelations(
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_SOURCE_OF -> OAFRelations(
|
||||
ModelConstants.IS_SOURCE_OF,
|
||||
ModelConstants.IS_DERIVED_FROM,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
|
||||
ModelConstants.IS_DERIVED_FROM,
|
||||
ModelConstants.IS_SOURCE_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.CITES -> OAFRelations(
|
||||
ModelConstants.CITES,
|
||||
ModelConstants.IS_CITED_BY,
|
||||
ModelConstants.CITATION
|
||||
),
|
||||
ModelConstants.IS_CITED_BY -> OAFRelations(
|
||||
ModelConstants.IS_CITED_BY,
|
||||
ModelConstants.CITES,
|
||||
ModelConstants.CITATION
|
||||
),
|
||||
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
|
||||
ModelConstants.IS_VARIANT_FORM_OF,
|
||||
ModelConstants.IS_DERIVED_FROM,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
|
||||
ModelConstants.IS_OBSOLETED_BY,
|
||||
ModelConstants.IS_NEW_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.REVIEWS -> OAFRelations(
|
||||
ModelConstants.REVIEWS,
|
||||
ModelConstants.IS_REVIEWED_BY,
|
||||
ModelConstants.REVIEW
|
||||
),
|
||||
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
|
||||
ModelConstants.IS_REVIEWED_BY,
|
||||
ModelConstants.REVIEWS,
|
||||
ModelConstants.REVIEW
|
||||
),
|
||||
ModelConstants.DOCUMENTS -> OAFRelations(
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.COMPILES -> OAFRelations(
|
||||
ModelConstants.COMPILES,
|
||||
ModelConstants.IS_COMPILED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_COMPILED_BY -> OAFRelations(
|
||||
ModelConstants.IS_COMPILED_BY,
|
||||
ModelConstants.COMPILES,
|
||||
ModelConstants.RELATIONSHIP
|
||||
)
|
||||
)
|
||||
|
||||
val datacite_filter: List[String] = {
|
||||
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
|
||||
require(stream != null)
|
||||
Source.fromInputStream(stream).getLines().toList
|
||||
}
|
||||
|
||||
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
trust
|
||||
)
|
||||
|
||||
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
|
||||
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
|
||||
Locale.ENGLISH
|
||||
)
|
||||
|
||||
val df_it: DateTimeFormatter =
|
||||
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
|
||||
val funder_regex: List[(Pattern, String)] = List(
|
||||
(
|
||||
Pattern.compile(
|
||||
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
|
||||
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
|
||||
),
|
||||
"40|corda__h2020::"
|
||||
),
|
||||
(
|
||||
Pattern.compile(
|
||||
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
|
||||
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
|
||||
),
|
||||
"40|corda_______::"
|
||||
)
|
||||
)
|
||||
|
||||
val Date_regex: List[Pattern] = List(
|
||||
//Y-M-D
|
||||
Pattern.compile(
|
||||
"(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
|
||||
Pattern.MULTILINE
|
||||
),
|
||||
//M-D-Y
|
||||
Pattern.compile(
|
||||
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
|
||||
Pattern.MULTILINE
|
||||
),
|
||||
//D-M-Y
|
||||
Pattern.compile(
|
||||
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
|
||||
Pattern.MULTILINE
|
||||
),
|
||||
//Y
|
||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||
)
|
||||
|
||||
}
|
|
@ -0,0 +1,652 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.datacite.DataciteModelConstants._
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.LocalDate
|
||||
import java.time.chrono.ThaiBuddhistDate
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
case class HostedByMapType(
|
||||
openaire_id: String,
|
||||
datacite_name: String,
|
||||
official_name: String,
|
||||
similarity: Option[Float]
|
||||
) {}
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val unknown_repository: HostedByMapType = HostedByMapType(
|
||||
ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID,
|
||||
ModelConstants.UNKNOWN_REPOSITORY.getValue,
|
||||
ModelConstants.UNKNOWN_REPOSITORY.getValue,
|
||||
Some(1.0f)
|
||||
)
|
||||
|
||||
val hostedByMap: Map[String, HostedByMapType] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(s)
|
||||
json.extract[Map[String, HostedByMapType]]
|
||||
}
|
||||
|
||||
/** This method should skip record if json contains invalid text
|
||||
* defined in gile datacite_filter
|
||||
*
|
||||
* @param json
|
||||
* @return True if the record should be skipped
|
||||
*/
|
||||
def skip_record(json: String): Boolean = {
|
||||
datacite_filter.exists(f => json.contains(f))
|
||||
}
|
||||
|
||||
@deprecated("this method will be removed", "dhp")
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def embargo_end(embargo_end_date: String): Boolean = {
|
||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||
val td = LocalDate.now()
|
||||
td.isAfter(dt)
|
||||
}
|
||||
|
||||
def extract_date(input: String): Option[String] = {
|
||||
val d = Date_regex
|
||||
.map(pattern => {
|
||||
val matcher = pattern.matcher(input)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
})
|
||||
.find(s => s != null)
|
||||
|
||||
if (d.isDefined) {
|
||||
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
return None
|
||||
}
|
||||
}
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
def fix_thai_date(input: String, format: String): String = {
|
||||
try {
|
||||
val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
|
||||
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
|
||||
LocalDate.from(d).toString
|
||||
} catch {
|
||||
case _: Throwable => ""
|
||||
}
|
||||
}
|
||||
|
||||
def getTypeQualifier(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
schemaOrg: String,
|
||||
vocabularies: VocabularyGroup
|
||||
): (Qualifier, Qualifier) = {
|
||||
if (resourceType != null && resourceType.nonEmpty) {
|
||||
val typeQualifier =
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
|
||||
if (typeQualifier != null)
|
||||
return (
|
||||
typeQualifier,
|
||||
vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
typeQualifier.getClassid
|
||||
)
|
||||
)
|
||||
}
|
||||
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
||||
val typeQualifier =
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
|
||||
if (typeQualifier != null)
|
||||
return (
|
||||
typeQualifier,
|
||||
vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
typeQualifier.getClassid
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
resourceTypeGeneral
|
||||
)
|
||||
if (typeQualifier != null)
|
||||
return (
|
||||
typeQualifier,
|
||||
vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
typeQualifier.getClassid
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
def getResult(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
schemaOrg: String,
|
||||
vocabularies: VocabularyGroup
|
||||
): Result = {
|
||||
val typeQualifiers: (Qualifier, Qualifier) =
|
||||
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (typeQualifiers == null)
|
||||
return null
|
||||
val i = new Instance
|
||||
i.setInstancetype(typeQualifiers._1)
|
||||
typeQualifiers._2.getClassname match {
|
||||
case "dataset" =>
|
||||
val r = new OafDataset
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "publication" =>
|
||||
val r = new Publication
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "software" =>
|
||||
val r = new Software
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "other" =>
|
||||
val r = new OtherResearchProduct
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
def available_date(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
val l: List[String] = for {
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JField("dateType", JString(dateTypes)) <- dates
|
||||
} yield dateTypes
|
||||
|
||||
l.exists(p => p.equalsIgnoreCase("available"))
|
||||
|
||||
}
|
||||
|
||||
/** As describe in ticket #6377
|
||||
* when the result come from figshare we need to remove subject
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
def fix_figshare(r: Result): Unit = {
|
||||
|
||||
if (r.getInstance() != null) {
|
||||
val hosted_by_figshare = r
|
||||
.getInstance()
|
||||
.asScala
|
||||
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||
}
|
||||
|
||||
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||
OafMapperUtils.structuredProperty(dt, q, null)
|
||||
}
|
||||
|
||||
def generateRelation(
|
||||
sourceId: String,
|
||||
targetId: String,
|
||||
relClass: String,
|
||||
cf: KeyValue,
|
||||
di: DataInfo
|
||||
): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
r.setTarget(targetId)
|
||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
||||
r.setRelClass(relClass)
|
||||
r.setSubRelType(ModelConstants.OUTCOME)
|
||||
r.setCollectedfrom(List(cf).asJava)
|
||||
r.setDataInfo(di)
|
||||
r
|
||||
|
||||
}
|
||||
|
||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
|
||||
|
||||
if (match_pattern.isDefined) {
|
||||
val m = match_pattern.get._1
|
||||
val p = match_pattern.get._2
|
||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
|
||||
} else
|
||||
List()
|
||||
|
||||
}
|
||||
|
||||
def generateOAF(
|
||||
input: String,
|
||||
ts: Long,
|
||||
dateOfCollection: Long,
|
||||
vocabularies: VocabularyGroup,
|
||||
exportLinks: Boolean
|
||||
): List[Oaf] = {
|
||||
if (skip_record(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral =
|
||||
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||
|
||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||
if (doi.isEmpty)
|
||||
return List()
|
||||
|
||||
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
|
||||
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (result == null)
|
||||
return List()
|
||||
|
||||
val doi_q = OafMapperUtils.qualifier(
|
||||
"doi",
|
||||
"doi",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
result.setOriginalId(List(doi).asJava)
|
||||
|
||||
val d = new Date(dateOfCollection * 1000)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
|
||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||
result.setDataInfo(dataInfo)
|
||||
|
||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||
|
||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||
val a = new Author
|
||||
a.setFullname(c.name.orNull)
|
||||
a.setName(c.givenName.orNull)
|
||||
a.setSurname(c.familyName.orNull)
|
||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||
a.setPid(
|
||||
c.nameIdentifiers.get
|
||||
.map(ni => {
|
||||
val q =
|
||||
if (ni.nameIdentifierScheme.isDefined)
|
||||
vocabularies.getTermAsQualifier(
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ni.nameIdentifierScheme.get.toLowerCase()
|
||||
)
|
||||
else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
} else
|
||||
null
|
||||
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(
|
||||
c.affiliation.get
|
||||
.filter(af => af.nonEmpty)
|
||||
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||
.asJava
|
||||
)
|
||||
a.setRank(idx + 1)
|
||||
a
|
||||
}
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(
|
||||
titles
|
||||
.filter(t => t.title.nonEmpty)
|
||||
.map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils
|
||||
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(
|
||||
t.title.get,
|
||||
t.titleType.get,
|
||||
t.titleType.get,
|
||||
ModelConstants.DNET_DATACITE_TITLE,
|
||||
ModelConstants.DNET_DATACITE_TITLE,
|
||||
null
|
||||
)
|
||||
}
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val dates = (json \\ "dates").extract[List[DateType]]
|
||||
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||
|
||||
val i_date = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
val a_date: Option[String] = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
.find(d => d != null && d.isDefined)
|
||||
.map(d => d.get)
|
||||
|
||||
if (a_date.isDefined) {
|
||||
if (doi.startsWith("10.14457"))
|
||||
result.setEmbargoenddate(
|
||||
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
else
|
||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||
}
|
||||
if (i_date.isDefined && i_date.get.isDefined) {
|
||||
if (doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
}
|
||||
} else if (publication_year != null) {
|
||||
if (doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||
)
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||
)
|
||||
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
}
|
||||
}
|
||||
|
||||
result.setRelevantdate(
|
||||
dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d =>
|
||||
(
|
||||
d._1.get,
|
||||
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
|
||||
)
|
||||
)
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2))
|
||||
.asJava
|
||||
)
|
||||
|
||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||
|
||||
result.setSubject(
|
||||
subjects
|
||||
.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s.subject.get,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
null
|
||||
)
|
||||
)
|
||||
.asJava
|
||||
)
|
||||
|
||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
|
||||
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||
|
||||
result.setDescription(
|
||||
descriptions
|
||||
.filter(d => d.description.isDefined)
|
||||
.map(d => OafMapperUtils.field(d.description.get, null))
|
||||
.filter(s => s != null)
|
||||
.asJava
|
||||
)
|
||||
|
||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||
if (publisher != null)
|
||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||
|
||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||
|
||||
if (language != null)
|
||||
result.setLanguage(
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
|
||||
)
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights
|
||||
.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
})
|
||||
.find(q => q != null)
|
||||
.map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
val access_rights_qualifier =
|
||||
if (aRights.isDefined) aRights.get
|
||||
else
|
||||
OafMapperUtils.accessRight(
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES,
|
||||
ModelConstants.DNET_ACCESS_MODES
|
||||
)
|
||||
|
||||
if (client.isDefined) {
|
||||
|
||||
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
|
||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
|
||||
|
||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r =>
|
||||
r.startsWith("http") && r.matches(
|
||||
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
|
||||
)
|
||||
)
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
}
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
var relations: List[Relation] =
|
||||
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
|
||||
fix_figshare(result)
|
||||
|
||||
if (result.getId == null)
|
||||
return List()
|
||||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
relations = relations ::: generateRelations(
|
||||
rels,
|
||||
result.getId,
|
||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
|
||||
)
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
} else
|
||||
List(result)
|
||||
}
|
||||
|
||||
private def generateRelations(
|
||||
rels: List[RelatedIdentifierType],
|
||||
id: String,
|
||||
date: String
|
||||
): List[Relation] = {
|
||||
rels
|
||||
.filter(r =>
|
||||
subRelTypeMapping
|
||||
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
)
|
||||
.map(r => {
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.setDataInfo(dataInfo)
|
||||
|
||||
val subRelType = subRelTypeMapping(r.relationType).relType
|
||||
rel.setRelType(REL_TYPE_VALUE)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(r.relationType)
|
||||
|
||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.setSource(id)
|
||||
rel.setTarget(
|
||||
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||
)
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||
rel
|
||||
})
|
||||
}
|
||||
|
||||
def generateDSId(input: String): String = {
|
||||
val b = StringUtils.substringBefore(input, "::")
|
||||
val a = StringUtils.substringAfter(input, "::")
|
||||
s"10|$b::${DHPUtils.md5(a)}"
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
override def run(): Unit = {
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"SourcePath is '$sourcePath'")
|
||||
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
|
||||
log.info(s"exportLinks is '$exportLinks'")
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
require(vocabularies != null)
|
||||
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info(s"outputBasePath is '$outputBasePath'")
|
||||
val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH"
|
||||
log.info(s"targetPath is '$targetPath'")
|
||||
|
||||
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
|
||||
|
||||
reportTotalSize(targetPath, outputBasePath)
|
||||
}
|
||||
|
||||
/** For working with MDStore we need to store in a file on hdfs the size of
|
||||
* the current dataset
|
||||
* @param targetPath
|
||||
* @param outputBasePath
|
||||
*/
|
||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||
val total_items = spark.read.text(targetPath).count()
|
||||
writeHdfsFile(
|
||||
spark.sparkContext.hadoopConfiguration,
|
||||
s"$total_items",
|
||||
outputBasePath + MDSTORE_SIZE_PATH
|
||||
)
|
||||
}
|
||||
|
||||
/** Generate the transformed and cleaned OAF Dataset from the native one
|
||||
*
|
||||
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
|
||||
* @param exportLinks If true it generates unresolved links
|
||||
* @param vocabularies vocabularies for cleaning
|
||||
* @param targetPath the targetPath of the result Dataset
|
||||
*/
|
||||
def generateDataciteDataset(
|
||||
sourcePath: String,
|
||||
exportLinks: Boolean,
|
||||
vocabularies: VocabularyGroup,
|
||||
targetPath: String,
|
||||
spark: SparkSession
|
||||
): Unit = {
|
||||
require(spark != null)
|
||||
import spark.implicits._
|
||||
|
||||
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
CollectionUtils.saveDataset(
|
||||
spark.read
|
||||
.load(sourcePath)
|
||||
.as[DataciteType]
|
||||
.filter(d => d.isActive)
|
||||
.flatMap(d =>
|
||||
DataciteToOAFTransformation
|
||||
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
|
||||
)
|
||||
.filter(d => d != null),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
object GenerateDataciteDatasetSpark {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new GenerateDataciteDatasetSpark(
|
||||
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
|
||||
args,
|
||||
log
|
||||
).initialize().run()
|
||||
}
|
||||
}
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
|
||||
|
@ -9,21 +8,20 @@ import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
|
|||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.time.format.DateTimeFormatter._
|
||||
import java.time.{LocalDate, LocalDateTime, ZoneOffset}
|
||||
import java.time.format.DateTimeFormatter.ISO_DATE_TIME
|
||||
import java.time.{LocalDateTime, ZoneOffset}
|
||||
import scala.io.Source
|
||||
|
||||
object ImportDatacite {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
|
||||
|
||||
|
||||
def convertAPIStringToDataciteItem(input: String): DataciteType = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
|
@ -33,14 +31,26 @@ object ImportDatacite {
|
|||
|
||||
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
|
||||
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
|
||||
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
|
||||
DataciteType(
|
||||
doi = doi,
|
||||
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
|
||||
isActive = isActive,
|
||||
json = input
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
|
||||
|
@ -61,7 +71,8 @@ object ImportDatacite {
|
|||
val spkipImport = parser.get("skipImport")
|
||||
log.info(s"skipImport is $spkipImport")
|
||||
|
||||
val spark: SparkSession = SparkSession.builder()
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.appName(ImportDatacite.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -79,45 +90,48 @@ object ImportDatacite {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
|
||||
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||
|
||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||
override def zero: DataciteType = null
|
||||
|
||||
override def zero: DataciteType = null
|
||||
|
||||
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
if (b == null)
|
||||
return a
|
||||
if (a == null)
|
||||
return b
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
if (b == null)
|
||||
return a
|
||||
if (a == null)
|
||||
return b
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
}
|
||||
b
|
||||
}
|
||||
b
|
||||
|
||||
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
reduce(a, b)
|
||||
}
|
||||
|
||||
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||
}
|
||||
|
||||
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
reduce(a, b)
|
||||
}
|
||||
|
||||
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||
}
|
||||
|
||||
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
|
||||
val ts = dump.select(max("timestamp")).first().getLong(0)
|
||||
|
||||
println(s"last Timestamp is $ts")
|
||||
|
||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
val cnt =
|
||||
if ("true".equalsIgnoreCase(spkipImport)) 1
|
||||
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
|
||||
println(s"Imported from Datacite API $cnt documents")
|
||||
|
||||
if (cnt > 0) {
|
||||
|
||||
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||
val inputRdd: RDD[DataciteType] = sc
|
||||
.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||
.map(s => s._2.toString)
|
||||
.map(s => convertAPIStringToDataciteItem(s))
|
||||
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
|
||||
|
@ -130,7 +144,9 @@ object ImportDatacite {
|
|||
.agg(dataciteAggregator.toColumn)
|
||||
.map(s => s._2)
|
||||
.repartition(4000)
|
||||
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${dataciteDump}_updated")
|
||||
|
||||
val fs = FileSystem.get(sc.hadoopConfiguration)
|
||||
fs.delete(new Path(s"$dataciteDump"), true)
|
||||
|
@ -138,14 +154,24 @@ object ImportDatacite {
|
|||
}
|
||||
}
|
||||
|
||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
|
||||
var from:Long = timestamp * 1000
|
||||
val delta:Long = 100000000L
|
||||
private def writeSequenceFile(
|
||||
hdfsTargetPath: Path,
|
||||
timestamp: Long,
|
||||
conf: Configuration,
|
||||
bs: Int
|
||||
): Long = {
|
||||
var from: Long = timestamp * 1000
|
||||
val delta: Long = 100000000L
|
||||
var client: DataciteAPIImporter = null
|
||||
val now :Long =System.currentTimeMillis()
|
||||
val now: Long = System.currentTimeMillis()
|
||||
var i = 0
|
||||
try {
|
||||
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
|
||||
val writer = SequenceFile.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfsTargetPath),
|
||||
SequenceFile.Writer.keyClass(classOf[IntWritable]),
|
||||
SequenceFile.Writer.valueClass(classOf[Text])
|
||||
)
|
||||
try {
|
||||
var start: Long = System.currentTimeMillis
|
||||
while (from < now) {
|
||||
|
@ -154,33 +180,32 @@ object ImportDatacite {
|
|||
val key: IntWritable = new IntWritable(i)
|
||||
val value: Text = new Text
|
||||
while (client.hasNext) {
|
||||
key.set({
|
||||
key.set {
|
||||
i += 1;
|
||||
i - 1
|
||||
})
|
||||
}
|
||||
value.set(client.next())
|
||||
writer.append(key, value)
|
||||
writer.hflush()
|
||||
if (i % 1000 == 0) {
|
||||
end = System.currentTimeMillis
|
||||
val time = (end - start) / 1000.0F
|
||||
val time = (end - start) / 1000.0f
|
||||
println(s"Imported $i in $time seconds")
|
||||
start = System.currentTimeMillis
|
||||
}
|
||||
}
|
||||
println(s"updating from value: $from -> ${from+delta}")
|
||||
println(s"updating from value: $from -> ${from + delta}")
|
||||
from = from + delta
|
||||
}
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
println("Error", e)
|
||||
} finally if (writer != null) writer.close()
|
||||
}
|
||||
catch {
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
log.error("Error", e)
|
||||
}
|
||||
i
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -1,18 +1,14 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.LocalFileSystem
|
||||
import org.apache.hadoop.hdfs.DistributedFileSystem
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
import java.util.{Date, Locale}
|
||||
import java.util.Locale
|
||||
import scala.io.Source
|
||||
|
||||
object SparkDownloadUpdateDatacite {
|
||||
|
@ -21,7 +17,13 @@ object SparkDownloadUpdateDatacite {
|
|||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
|
@ -30,8 +32,9 @@ object SparkDownloadUpdateDatacite {
|
|||
val hdfsuri = parser.get("namenode")
|
||||
log.info(s"namenode is $hdfsuri")
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -41,13 +44,18 @@ object SparkDownloadUpdateDatacite {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val maxDate:String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
|
||||
val maxDate: String = spark.read
|
||||
.load(workingPath)
|
||||
.as[Oaf]
|
||||
.filter(s => s.isInstanceOf[Result])
|
||||
.map(r => r.asInstanceOf[Result].getDateofcollection)
|
||||
.select(max("value"))
|
||||
.first()
|
||||
.getString(0)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
val string_to_date =ISO8601FORMAT.parse(maxDate)
|
||||
val string_to_date = ISO8601FORMAT.parse(maxDate)
|
||||
val ts = string_to_date.getTime
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,597 @@
|
|||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import collection.JavaConverters._
|
||||
|
||||
object BioDBToOAF {
|
||||
|
||||
case class EBILinkItem(id: Long, links: String) {}
|
||||
|
||||
case class EBILinks(
|
||||
relType: String,
|
||||
date: String,
|
||||
title: String,
|
||||
pmid: String,
|
||||
targetPid: String,
|
||||
targetPidType: String,
|
||||
targetUrl: String
|
||||
) {}
|
||||
|
||||
case class UniprotDate(date: String, date_info: String) {}
|
||||
|
||||
case class ScholixResolved(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
typology: String,
|
||||
tilte: List[String],
|
||||
datasource: List[String],
|
||||
date: List[String],
|
||||
authors: List[String]
|
||||
) {}
|
||||
|
||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
"0.9"
|
||||
)
|
||||
val SUBJ_CLASS = "Keywords"
|
||||
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
|
||||
val resolvedURL: Map[String, String] = Map(
|
||||
"genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
|
||||
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
|
||||
"clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
|
||||
"onim" -> "https://omim.org/entry/",
|
||||
"refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
||||
)
|
||||
|
||||
val collectedFromMap: Map[String, KeyValue] = {
|
||||
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
|
||||
"Protein Data Bank"
|
||||
)
|
||||
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::c2a591f440598b63d854556beaf01591",
|
||||
"European Nucleotide Archive"
|
||||
)
|
||||
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
|
||||
"NCBI Nucleotide"
|
||||
)
|
||||
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::296e1abaf1302897a6838d3588cd0310",
|
||||
"UniProtKB/Swiss-Prot"
|
||||
)
|
||||
val ElsevierCollectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
||||
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
|
||||
"Springer Nature"
|
||||
)
|
||||
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|opendoar____::83e60e09c222f206c725385f53d7e567c",
|
||||
"EMBL-EBIs Protein Data Bank in Europe (PDBe)"
|
||||
)
|
||||
val pubmedCollectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
|
||||
PDBCollectedFrom.setDataInfo(DATA_INFO)
|
||||
ElsevierCollectedFrom.setDataInfo(DATA_INFO)
|
||||
EBICollectedFrom.setDataInfo(DATA_INFO)
|
||||
pubmedCollectedFrom.setDataInfo(DATA_INFO)
|
||||
enaCollectedFrom.setDataInfo(DATA_INFO)
|
||||
ncbiCollectedFrom.setDataInfo(DATA_INFO)
|
||||
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
|
||||
|
||||
Map(
|
||||
"uniprot" -> UNIPROTCollectedFrom,
|
||||
"pdb" -> PDBCollectedFrom,
|
||||
"elsevier" -> ElsevierCollectedFrom,
|
||||
"ebi" -> EBICollectedFrom,
|
||||
"Springer Nature" -> springerNatureCollectedFrom,
|
||||
"NCBI Nucleotide" -> ncbiCollectedFrom,
|
||||
"European Nucleotide Archive" -> enaCollectedFrom,
|
||||
"Europe PMC" -> pubmedCollectedFrom
|
||||
)
|
||||
}
|
||||
|
||||
def crossrefLinksToOaf(input: String): Oaf = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
|
||||
val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
|
||||
|
||||
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
|
||||
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
|
||||
|
||||
val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
|
||||
|
||||
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
|
||||
|
||||
createRelation(
|
||||
target_pid,
|
||||
target_pid_type,
|
||||
generate_unresolved_id(source_pid, source_pid_type),
|
||||
collectedFromMap("elsevier"),
|
||||
"relationship",
|
||||
relation_semantic,
|
||||
date
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.pid.toLowerCase,
|
||||
input.pidType.toLowerCase,
|
||||
input.pidType.toLowerCase,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setDataInfo(DATA_INFO)
|
||||
|
||||
val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
|
||||
|
||||
if (input.tilte != null && input.tilte.nonEmpty)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.tilte.head,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setOriginalId(List(input.pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
|
||||
if (resolvedURL.contains(input.pidType)) {
|
||||
i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
|
||||
}
|
||||
|
||||
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0037",
|
||||
"Clinical Trial",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
else
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
if (input.datasource == null || input.datasource.isEmpty)
|
||||
return null
|
||||
|
||||
val ds = input.datasource.head
|
||||
d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
|
||||
i.setCollectedfrom(collectedFromMap(ds))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
if (input.authors != null && input.authors.nonEmpty) {
|
||||
val authors = input.authors.map(a => {
|
||||
val authorOAF = new Author
|
||||
authorOAF.setFullname(a)
|
||||
authorOAF
|
||||
})
|
||||
d.setAuthor(authors.asJava)
|
||||
}
|
||||
if (input.date != null && input.date.nonEmpty) {
|
||||
val dt = input.date.head
|
||||
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
def uniprotToOAF(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pid = (json \ "pid").extract[String]
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
pid,
|
||||
"uniprot",
|
||||
"uniprot",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
|
||||
d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
|
||||
|
||||
val title: String = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title != null)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setOriginalId(List(pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("uniprot"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val dates: List[UniprotDate] = for {
|
||||
JObject(dateOBJ) <- json \ "dates"
|
||||
JField("date", JString(date)) <- dateOBJ
|
||||
JField("date_info", JString(date_info)) <- dateOBJ
|
||||
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
|
||||
|
||||
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
|
||||
|
||||
if (subjects != null) {
|
||||
d.setSubject(
|
||||
subjects
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
null
|
||||
)
|
||||
)
|
||||
.asJava
|
||||
)
|
||||
}
|
||||
var i_date: Option[UniprotDate] = None
|
||||
|
||||
if (dates.nonEmpty) {
|
||||
i_date = dates.find(d => d.date_info.contains("entry version"))
|
||||
if (i_date.isDefined) {
|
||||
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
val relevant_dates: List[StructuredProperty] = dates
|
||||
.filter(d => !d.date_info.contains("entry version"))
|
||||
.map(date =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
date.date,
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
DATA_INFO
|
||||
)
|
||||
)
|
||||
if (relevant_dates != null && relevant_dates.nonEmpty)
|
||||
d.setRelevantdate(relevant_dates.asJava)
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
|
||||
val references_pmid: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JField("PubMed", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
val references_doi: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JField(" DOI", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
if (references_pmid != null && references_pmid.nonEmpty) {
|
||||
val rel = createRelation(
|
||||
references_pmid.head,
|
||||
"pmid",
|
||||
d.getId,
|
||||
collectedFromMap("uniprot"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
if (i_date.isDefined) i_date.get.date else null
|
||||
)
|
||||
rel.getCollectedfrom
|
||||
List(d, rel)
|
||||
} else if (references_doi != null && references_doi.nonEmpty) {
|
||||
val rel = createRelation(
|
||||
references_doi.head,
|
||||
"doi",
|
||||
d.getId,
|
||||
collectedFromMap("uniprot"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
if (i_date.isDefined) i_date.get.date else null
|
||||
)
|
||||
List(d, rel)
|
||||
} else
|
||||
List(d)
|
||||
}
|
||||
|
||||
def generate_unresolved_id(pid: String, pidType: String): String = {
|
||||
s"unresolved::$pid::$pidType"
|
||||
}
|
||||
|
||||
def createRelation(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
sourceId: String,
|
||||
collectedFrom: KeyValue,
|
||||
subRelType: String,
|
||||
relClass: String,
|
||||
date: String
|
||||
): Relation = {
|
||||
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
rel.setDataInfo(DATA_INFO)
|
||||
|
||||
rel.setRelType(ModelConstants.RESULT_RESULT)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(relClass)
|
||||
|
||||
rel.setSource(sourceId)
|
||||
rel.setTarget(s"unresolved::$pid::$pidType")
|
||||
|
||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.getTarget.startsWith("unresolved")
|
||||
rel.setCollectedfrom(List(collectedFrom).asJava)
|
||||
rel
|
||||
|
||||
}
|
||||
|
||||
def createSupplementaryRelation(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
sourceId: String,
|
||||
collectedFrom: KeyValue,
|
||||
date: String
|
||||
): Relation = {
|
||||
createRelation(
|
||||
pid,
|
||||
pidType,
|
||||
sourceId,
|
||||
collectedFrom,
|
||||
ModelConstants.SUPPLEMENT,
|
||||
ModelConstants.IS_SUPPLEMENT_TO,
|
||||
date
|
||||
)
|
||||
}
|
||||
|
||||
def pdbTOOaf(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pdb = (json \ "pdb").extract[String].toLowerCase
|
||||
|
||||
if (pdb.isEmpty)
|
||||
return List()
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
pdb,
|
||||
"pdb",
|
||||
"Protein Data Bank Identifier",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
|
||||
d.setOriginalId(List(pdb).asJava)
|
||||
|
||||
val title = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title == null)
|
||||
return List()
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
|
||||
|
||||
if (authors != null) {
|
||||
val convertedAuthors = authors.zipWithIndex.map { a =>
|
||||
val res = new Author
|
||||
res.setFullname(a._1)
|
||||
res.setRank(a._2 + 1)
|
||||
res
|
||||
}
|
||||
|
||||
d.setAuthor(convertedAuthors.asJava)
|
||||
}
|
||||
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("pdb"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val pmid = (json \ "pmid").extractOrElse[String](null)
|
||||
|
||||
if (pmid != null)
|
||||
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
|
||||
else
|
||||
List(d)
|
||||
}
|
||||
|
||||
def extractEBILinksFromDump(input: String): EBILinkItem = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val pmid = (json \ "publication" \ "pmid").extract[String]
|
||||
val links = (json \ "links").extract[JObject]
|
||||
EBILinkItem(pmid.toLong, compact(render(links)))
|
||||
}
|
||||
|
||||
def EBITargetLinksFilter(input: EBILinks): Boolean = {
|
||||
|
||||
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase(
|
||||
"pdb"
|
||||
) || input.targetPidType.equalsIgnoreCase("uniprot")
|
||||
|
||||
}
|
||||
|
||||
def parse_ebi_links(input: String): List[EBILinks] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pmid = (json \ "request" \ "id").extract[String]
|
||||
for {
|
||||
JObject(link) <- json \\ "Link"
|
||||
JField("Target", JObject(target)) <- link
|
||||
JField("RelationshipType", JObject(relType)) <- link
|
||||
JField("Name", JString(relation)) <- relType
|
||||
JField("PublicationDate", JString(publicationDate)) <- link
|
||||
JField("Title", JString(title)) <- target
|
||||
JField("Identifier", JObject(identifier)) <- target
|
||||
JField("IDScheme", JString(idScheme)) <- identifier
|
||||
JField("IDURL", JString(idUrl)) <- identifier
|
||||
JField("ID", JString(id)) <- identifier
|
||||
|
||||
} yield EBILinks(
|
||||
relation,
|
||||
GraphCleaningFunctions.cleanDate(publicationDate),
|
||||
title,
|
||||
pmid,
|
||||
id,
|
||||
idScheme,
|
||||
idUrl
|
||||
)
|
||||
}
|
||||
|
||||
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
|
||||
val d = new Dataset
|
||||
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.title,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
|
||||
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
|
||||
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.targetPid.toLowerCase,
|
||||
input.targetPidType.toLowerCase,
|
||||
"Protein Data Bank Identifier",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(input.targetUrl).asJava)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("ebi"))
|
||||
d.setInstance(List(i).asJava)
|
||||
i.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||
)
|
||||
d.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||
)
|
||||
|
||||
List(
|
||||
d,
|
||||
createRelation(
|
||||
input.pmid,
|
||||
"pmid",
|
||||
d.getId,
|
||||
collectedFromMap("ebi"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
GraphCleaningFunctions.cleanDate(input.date)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkTransformBioDatabaseToOAF {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val database: String = parser.get("database")
|
||||
log.info("database: {}", database)
|
||||
|
||||
val dbPath: String = parser.get("dbPath")
|
||||
log.info("dbPath: {}", database)
|
||||
val targetPath: String = parser.get("targetPath")
|
||||
log.info("targetPath: {}", database)
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
val sc = spark.sparkContext
|
||||
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
import spark.implicits._
|
||||
database.toUpperCase() match {
|
||||
case "UNIPROT" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
||||
targetPath
|
||||
)
|
||||
case "PDB" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
||||
targetPath
|
||||
)
|
||||
case "SCHOLIX" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
||||
targetPath
|
||||
)
|
||||
case "CROSSREF_LINKS" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,9 +1,10 @@
|
|||
package eu.dnetlib.dhp.sx.graph.ebi
|
||||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.oaf.Result
|
||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
|
@ -23,31 +24,37 @@ import scala.xml.pull.XMLEventReader
|
|||
|
||||
object SparkCreateBaselineDataFrame {
|
||||
|
||||
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
|
||||
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
||||
|
||||
def requestBaseLineUpdatePage(maxFile:String):List[(String,String)] = {
|
||||
val data =requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
||||
val result = data.lines
|
||||
.filter(l => l.startsWith("<a href="))
|
||||
.map { l =>
|
||||
val end = l.lastIndexOf("\">")
|
||||
val start = l.indexOf("<a href=\"")
|
||||
|
||||
val result =data.lines.filter(l => l.startsWith("<a href=")).map{l =>
|
||||
val end = l.lastIndexOf("\">")
|
||||
val start = l.indexOf("<a href=\"")
|
||||
|
||||
if (start>= 0 && end >start)
|
||||
l.substring(start+9, (end-start))
|
||||
else
|
||||
""
|
||||
}.filter(s =>s.endsWith(".gz") ).filter(s => s > maxFile).map(s => (s,s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
|
||||
if (start >= 0 && end > start)
|
||||
l.substring(start + 9, end - start)
|
||||
else
|
||||
""
|
||||
}
|
||||
.filter(s => s.endsWith(".gz"))
|
||||
.filter(s => s > maxFile)
|
||||
.map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
|
||||
.toList
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
|
||||
def downloadBaselinePart(url:String):InputStream = {
|
||||
def downloadBaselinePart(url: String): InputStream = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
val response = client.execute(r)
|
||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
|
@ -55,13 +62,15 @@ object SparkCreateBaselineDataFrame {
|
|||
|
||||
}
|
||||
|
||||
def requestPage(url:String):String = {
|
||||
def requestPage(url: String): String = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -72,8 +81,7 @@ object SparkCreateBaselineDataFrame {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -89,26 +97,20 @@ object SparkCreateBaselineDataFrame {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
|
||||
|
||||
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
||||
|
||||
val conf = new Configuration
|
||||
conf.set("fs.defaultFS", hdfsServerUri)
|
||||
val fs = FileSystem.get(conf)
|
||||
val fs = FileSystem.get(conf)
|
||||
val p = new Path(baselinePath)
|
||||
val files = fs.listFiles(p,false)
|
||||
val files = fs.listFiles(p, false)
|
||||
var max_file = ""
|
||||
while (files.hasNext) {
|
||||
val c = files.next()
|
||||
val data = c.getPath.toString
|
||||
val fileName = data.substring(data.lastIndexOf("/")+1)
|
||||
val fileName = data.substring(data.lastIndexOf("/") + 1)
|
||||
|
||||
if (fileName> max_file)
|
||||
if (fileName > max_file)
|
||||
max_file = fileName
|
||||
}
|
||||
|
||||
|
@ -118,42 +120,43 @@ object SparkCreateBaselineDataFrame {
|
|||
val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}")
|
||||
val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
|
||||
val i = downloadBaselinePart(u._2)
|
||||
val buffer = Array.fill[Byte](1024)(0)
|
||||
while(i.read(buffer)>0) {
|
||||
fsDataOutputStream.write(buffer)
|
||||
}
|
||||
i.close()
|
||||
IOUtils.copy(i, fsDataOutputStream)
|
||||
println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
|
||||
fsDataOutputStream.close()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
|
||||
new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
||||
override def zero: PMArticle = new PMArticle
|
||||
|
||||
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
||||
override def zero: PMArticle = new PMArticle
|
||||
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
|
||||
if (b != null && b.getPmid != null) b else a._2
|
||||
}
|
||||
|
||||
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
|
||||
if (b != null && b.getPmid!= null) b else a._2
|
||||
override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
|
||||
if (b1 != null && b1.getPmid != null) b1 else b2
|
||||
|
||||
}
|
||||
|
||||
override def finish(reduction: PMArticle): PMArticle = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
|
||||
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
}
|
||||
|
||||
override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
|
||||
if (b1 != null && b1.getPmid!= null) b1 else b2
|
||||
|
||||
}
|
||||
|
||||
override def finish(reduction: PMArticle): PMArticle = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
|
||||
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
@ -166,6 +169,8 @@ object SparkCreateBaselineDataFrame {
|
|||
val hdfsServerUri = parser.get("hdfsServerUri")
|
||||
log.info("hdfsServerUri: {}", targetPath)
|
||||
|
||||
val skipUpdate = parser.get("skipUpdate")
|
||||
log.info("skipUpdate: {}", skipUpdate)
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
|
@ -174,33 +179,44 @@ object SparkCreateBaselineDataFrame {
|
|||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
import spark.implicits._
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sc = spark.sparkContext
|
||||
import spark.implicits._
|
||||
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
|
||||
val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
|
||||
} ))
|
||||
|
||||
ds.map(p => (p.getPmid,p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
||||
.agg(pmArticleAggregator.toColumn)
|
||||
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(
|
||||
k.filter(i => i._1.endsWith(".gz"))
|
||||
.flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
})
|
||||
)
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
|
||||
.groupByKey(_._1)
|
||||
.agg(pmArticleAggregator.toColumn)
|
||||
.map(p => p._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/baseline_dataset")
|
||||
}
|
||||
|
||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||
exported_dataset
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
|
||||
.filter(p => p!= null)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
exported_dataset
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||
.as[Oaf]
|
||||
.filter(p => p != null),
|
||||
targetPath
|
||||
)
|
||||
|
||||
}
|
||||
}
|
|
@ -1,8 +1,8 @@
|
|||
package eu.dnetlib.dhp.sx.graph.ebi
|
||||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.HttpGet
|
||||
|
@ -14,21 +14,23 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
|
||||
object SparkDownloadEBILinks {
|
||||
|
||||
def createEBILinks(pmid:Long):EBILinkItem = {
|
||||
def createEBILinks(pmid: Long): EBILinkItem = {
|
||||
|
||||
val res = requestLinks(pmid)
|
||||
if (res!=null)
|
||||
if (res != null)
|
||||
return EBILinkItem(pmid, res)
|
||||
null
|
||||
}
|
||||
|
||||
def requestPage(url:String):String = {
|
||||
def requestPage(url: String): String = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -39,8 +41,7 @@ object SparkDownloadEBILinks {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -56,61 +57,89 @@ object SparkDownloadEBILinks {
|
|||
}
|
||||
}
|
||||
|
||||
def requestLinks(PMID:Long):String = {
|
||||
def requestLinks(PMID: Long): String = {
|
||||
requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
|
||||
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val MAX_ITEM_PER_PARTITION = 20000
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
log.info("Getting max pubmedId where the links have been requested")
|
||||
val links:Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val lastPMIDRequested =links.map(l => l.id).select(max("value")).first.getLong(0)
|
||||
log.info("Getting max pubmedId where the links have already requested")
|
||||
val links: Dataset[EBILinkItem] =
|
||||
spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
|
||||
|
||||
log.info("Retrieving PMID to request links")
|
||||
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
|
||||
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
|
||||
pubmed
|
||||
.map(p => p.getPmid.toLong)
|
||||
.where(s"value > $lastPMIDRequested")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/id_to_request")
|
||||
|
||||
val pmidToReq:Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
||||
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
||||
|
||||
val total = pmidToReq.count()
|
||||
|
||||
spark.createDataset(pmidToReq.rdd.repartition((total/MAX_ITEM_PER_PARTITION).toInt).map(pmid =>createEBILinks(pmid)).filter(l => l!= null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
|
||||
spark
|
||||
.createDataset(
|
||||
pmidToReq.rdd
|
||||
.repartition((total / MAX_ITEM_PER_PARTITION).toInt)
|
||||
.map(pmid => createEBILinks(pmid))
|
||||
.filter(l => l != null)
|
||||
)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/links_update")
|
||||
|
||||
val updates:Dataset[EBILinkItem] =spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
val updates: Dataset[EBILinkItem] =
|
||||
spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
|
||||
links.union(updates).groupByKey(_.id)
|
||||
.reduceGroups{(x,y) =>
|
||||
if (x == null || x.links ==null)
|
||||
links
|
||||
.union(updates)
|
||||
.groupByKey(_.id)
|
||||
.reduceGroups { (x, y) =>
|
||||
if (x == null || x.links == null)
|
||||
y
|
||||
if (y ==null || y.links ==null)
|
||||
if (y == null || y.links == null)
|
||||
x
|
||||
if (x.links.length > y.links.length)
|
||||
x
|
||||
x
|
||||
else
|
||||
y
|
||||
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
|
||||
}
|
||||
.map(_._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/links_final")
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue