forked from D-Net/dnet-hadoop
Compare commits
411 Commits
master
...
stable_id_
Author | SHA1 | Date |
---|---|---|
Sandro La Bruzzo | aeb8132627 | |
Sandro La Bruzzo | efbea1e01a | |
Claudio Atzori | 2039bb9f5f | |
Claudio Atzori | dd19c4ac5a | |
Claudio Atzori | e9e86a237d | |
Claudio Atzori | 10bd6ca194 | |
Claudio Atzori | a900bfb874 | |
Sandro La Bruzzo | dd997c49e0 | |
Claudio Atzori | 741077dbca | |
Miriam Baglioni | 32b0c27217 | |
Sandro La Bruzzo | 0d1f37302f | |
Miriam Baglioni | dc07f1079b | |
Miriam Baglioni | 8d2e086e48 | |
Miriam Baglioni | f33521d338 | |
Miriam Baglioni | bc12e9819e | |
Sandro La Bruzzo | 0cdb7ccdaa | |
Sandro La Bruzzo | 5b724d9972 | |
Sandro La Bruzzo | e57294ac99 | |
Michele Artini | ede2749822 | |
Michele Artini | f0fbfdcfae | |
Michele Artini | e950750262 | |
Michele Artini | 03a510859a | |
Michele Artini | e9f2b6037c | |
Sandro La Bruzzo | 02ef46535f | |
Sandro La Bruzzo | aeadc5a366 | |
Claudio Atzori | 96238152cb | |
Michele Artini | ad56a44fda | |
Claudio Atzori | 83722ebc47 | |
Claudio Atzori | eb6acfbabc | |
Claudio Atzori | 6e3a4e9237 | |
Claudio Atzori | ac3d090e9e | |
Michele Artini | 4fa5671d16 | |
Claudio Atzori | c3d92247d3 | |
Claudio Atzori | d512062b58 | |
Claudio Atzori | 5e4b91d9ef | |
Sandro La Bruzzo | bced804151 | |
Claudio Atzori | 4f58418184 | |
Claudio Atzori | 9d725efdc1 | |
Claudio Atzori | 863b56b6ce | |
Claudio Atzori | ae5c28e54f | |
Claudio Atzori | b695932ae4 | |
Claudio Atzori | 232dce83db | |
Claudio Atzori | c4a23c2f4d | |
Claudio Atzori | ba03f549d7 | |
Michele Artini | e56ccec536 | |
Michele Artini | c1e20de7cf | |
Claudio Atzori | a9f512103b | |
Claudio Atzori | 2cbf15f4fb | |
Claudio Atzori | f19feceaf0 | |
Claudio Atzori | 1bd70fa2c6 | |
Claudio Atzori | ca3f3a7687 | |
Claudio Atzori | 0358ae16ce | |
Claudio Atzori | 23b8883ab1 | |
Claudio Atzori | 609eb711b3 | |
Claudio Atzori | 1517bf7c92 | |
Sandro La Bruzzo | 6424cd9062 | |
Sandro La Bruzzo | 073dcea2aa | |
Claudio Atzori | d4c3476152 | |
Claudio Atzori | d1cbee8413 | |
Claudio Atzori | 3797543600 | |
Claudio Atzori | 8a0de2fc18 | |
Michele Artini | d82071ba6c | |
Claudio Atzori | d4a30fabe3 | |
Claudio Atzori | dccaf173cf | |
Claudio Atzori | 2e1eb96f9a | |
Claudio Atzori | b1785ba77c | |
Claudio Atzori | fb930b84d3 | |
Claudio Atzori | 923d19ea8e | |
Sandro La Bruzzo | 714b71bd21 | |
Claudio Atzori | ba86835951 | |
Claudio Atzori | c00be646f3 | |
Michele Artini | f4bd2b5619 | |
Michele Artini | 49910aedca | |
Claudio Atzori | 5cc3e6d61c | |
Michele Artini | b4877da363 | |
Alessia Bardi | 9a20057615 | |
Michele Artini | 6692128234 | |
Michele Artini | a278d67175 | |
Claudio Atzori | f6ccd54d87 | |
Claudio Atzori | 91e7220f20 | |
Michele Artini | f77ba34126 | |
Michele Artini | 7c5cd86927 | |
Michele Artini | b5cf505cc6 | |
Enrico Ottonello | c537986b7c | |
Sandro La Bruzzo | 2129e9caa7 | |
Claudio Atzori | 5afa7d3e0c | |
Claudio Atzori | ac77a245a3 | |
Claudio Atzori | f783e60ff7 | |
Sandro La Bruzzo | 63c0303137 | |
Sandro La Bruzzo | 74484d2823 | |
Claudio Atzori | 233d849f90 | |
Claudio Atzori | fcd13f5350 | |
Claudio Atzori | 4028176559 | |
Sandro La Bruzzo | c74b03d59c | |
Sandro La Bruzzo | 7f8848ecdd | |
Claudio Atzori | 27ab8a704d | |
Claudio Atzori | a7cf449b36 | |
Claudio Atzori | 82de6fb634 | |
Claudio Atzori | fa42026590 | |
Claudio Atzori | ef4bfd82e2 | |
Claudio Atzori | faa8f6f4e2 | |
miconis | 6d5c14e030 | |
Claudio Atzori | c2bb03c8b5 | |
Claudio Atzori | c25238480c | |
miconis | d0e3366c34 | |
miconis | 3c12eeadce | |
Claudio Atzori | e5abbec2ba | |
Claudio Atzori | 55964cbd81 | |
Claudio Atzori | 8f309b72ff | |
Claudio Atzori | 52244f813a | |
Sandro La Bruzzo | fd29307b84 | |
Claudio Atzori | 815b9f4d56 | |
Claudio Atzori | d0d477cca3 | |
miconis | 0393cdce42 | |
miconis | cadd0a5de8 | |
Sandro La Bruzzo | e06c7f32f6 | |
Sandro La Bruzzo | dbe0d0378e | |
Sandro La Bruzzo | 524e5f3092 | |
Sandro La Bruzzo | cdfe01bbae | |
Sandro La Bruzzo | 3ae67b7a1d | |
Sandro La Bruzzo | a16e5299f9 | |
Claudio Atzori | 45057440c1 | |
Enrico Ottonello | 34ca792a55 | |
Enrico Ottonello | 27068aacd1 | |
miconis | 7ad573d023 | |
Sandro La Bruzzo | 67085da305 | |
Sandro La Bruzzo | 644aa8f40c | |
Sandro La Bruzzo | 7d6a80e2f2 | |
Claudio Atzori | 8704d32780 | |
Claudio Atzori | ba4b4c74d8 | |
Claudio Atzori | 3d58f95522 | |
miconis | f64e57c112 | |
miconis | 176a5e493d | |
miconis | 3525a8f504 | |
Claudio Atzori | 745fa92db8 | |
Claudio Atzori | 083c2959dc | |
Sandro La Bruzzo | 3f77bfceb0 | |
Claudio Atzori | 3125cef545 | |
Sandro La Bruzzo | 44a0064df6 | |
Sandro La Bruzzo | 479abd10cb | |
Claudio Atzori | 710cd1e8f2 | |
Claudio Atzori | d1ca025b0b | |
miconis | 1542196a33 | |
miconis | 369ed1cd8a | |
Andreas Czerniak | 52fbece3b3 | |
Andreas Czerniak | d7614c1f85 | |
Andreas Czerniak | 3b694074ff | |
Claudio Atzori | 511c0521e5 | |
Claudio Atzori | 72dcadd8e6 | |
Claudio Atzori | 902d05f548 | |
miconis | d442e25cbc | |
Andreas Czerniak | 34df35926c | |
miconis | 11b22b2d23 | |
miconis | 0857100fb8 | |
miconis | bf685d849f | |
miconis | eaaefb8b4c | |
miconis | c39c82dfe9 | |
Claudio Atzori | 1e7e5180fa | |
Claudio Atzori | e686b8de8d | |
Claudio Atzori | ee34cc51c3 | |
Claudio Atzori | 70e49ed53c | |
Claudio Atzori | 7941d7be29 | |
Claudio Atzori | 879e8cc7ef | |
Claudio Atzori | 72ce741ea6 | |
Enrico Ottonello | 59ec5137e1 | |
Sandro La Bruzzo | 616d2ecce2 | |
Claudio Atzori | 27681b876c | |
Claudio Atzori | 9237d55d7f | |
Claudio Atzori | 7f4e9479ec | |
Claudio Atzori | 940556f6d3 | |
miconis | 2709d08fc2 | |
miconis | f446580e9f | |
Claudio Atzori | 3becaa5539 | |
Claudio Atzori | a0837ac357 | |
Claudio Atzori | 48f2b6127e | |
miconis | 2355cc4e9b | |
Sandro La Bruzzo | 1dfda3624e | |
Claudio Atzori | b5b7dc2104 | |
Enrico Ottonello | 91d8660982 | |
Enrico Ottonello | ebd67b8c8f | |
Claudio Atzori | 827e7e37db | |
miconis | 28c1cdd132 | |
miconis | 5dfb66b0fa | |
miconis | 348b0ef921 | |
Claudio Atzori | 751125fdf9 | |
Claudio Atzori | 1e423fdc07 | |
Claudio Atzori | e5ebb500cf | |
Claudio Atzori | b75ad76f79 | |
Claudio Atzori | 8db248aa13 | |
Sandro La Bruzzo | 625e4c29c4 | |
Claudio Atzori | b4febed138 | |
Claudio Atzori | 431cbe9955 | |
Sandro La Bruzzo | c392936b97 | |
Sandro La Bruzzo | c73072079d | |
Sandro La Bruzzo | 098914dcff | |
miconis | 0fe40b08e4 | |
miconis | 98854b0124 | |
Claudio Atzori | 5a043e95ea | |
Claudio Atzori | a4e82a65aa | |
Claudio Atzori | 3256b9c836 | |
Claudio Atzori | 75144dacb3 | |
Claudio Atzori | 9588bfba81 | |
Claudio Atzori | 972d5a3d98 | |
Sandro La Bruzzo | 25d5663d97 | |
Sandro La Bruzzo | 5f98ea74a9 | |
Sandro La Bruzzo | b4805b989d | |
Claudio Atzori | 734232d3b9 | |
Sandro La Bruzzo | 76b10090fc | |
Claudio Atzori | a3dac32f16 | |
Sandro La Bruzzo | 2be0428047 | |
Claudio Atzori | 8257f9a2bc | |
Sandro La Bruzzo | 7c97a4d900 | |
Sandro La Bruzzo | cc5bbafa5d | |
Claudio Atzori | 3b2da86f0a | |
Claudio Atzori | 640b885706 | |
Claudio Atzori | 61a2551e74 | |
Claudio Atzori | 9cac6da9bd | |
Claudio Atzori | d3cb923f24 | |
Sandro La Bruzzo | 4bb3bcafa5 | |
Sandro La Bruzzo | a8e5d0ea0d | |
Sandro La Bruzzo | f5e7c57654 | |
Claudio Atzori | f74e464942 | |
Claudio Atzori | c801ab6c1d | |
Claudio Atzori | 9917d7e01c | |
Claudio Atzori | 01630f638d | |
Claudio Atzori | b3f3b895e5 | |
Claudio Atzori | 765f9bdee7 | |
Claudio Atzori | 59532b0919 | |
Claudio Atzori | d525785497 | |
Sandro La Bruzzo | bbe1a7c69a | |
Sandro La Bruzzo | a2169ccf07 | |
Claudio Atzori | f468c7f0d7 | |
Claudio Atzori | 76441f4edd | |
Claudio Atzori | 8d2bb24512 | |
Claudio Atzori | acbe3119a4 | |
Claudio Atzori | fa7930d2e2 | |
Claudio Atzori | 55f6ff5f55 | |
Claudio Atzori | ec80b7ade3 | |
Claudio Atzori | 36f750cd1d | |
Claudio Atzori | b73dce3e3a | |
Enrico Ottonello | 20c0438f11 | |
Enrico Ottonello | 70cb100647 | |
Enrico Ottonello | bd3b16402b | |
Claudio Atzori | e76c4f62c1 | |
miconis | 1a85020572 | |
Enrico Ottonello | ca1800510a | |
Enrico Ottonello | 53d7023460 | |
Claudio Atzori | 7df2461ccc | |
Enrico Ottonello | d43ea88caf | |
Claudio Atzori | b830e33392 | |
Claudio Atzori | dc98c39500 | |
Claudio Atzori | 271e88537b | |
Claudio Atzori | 9c899f4433 | |
Claudio Atzori | fc3fa5e343 | |
Enrico Ottonello | 975823b968 | |
Claudio Atzori | e7eba9f7e7 | |
Claudio Atzori | 58467aaf1e | |
Claudio Atzori | cc88701f29 | |
Claudio Atzori | 545f8f3e48 | |
Claudio Atzori | b592d78bb4 | |
Claudio Atzori | cf27905a71 | |
Claudio Atzori | 58288a95b8 | |
Claudio Atzori | 1abe6d1ad7 | |
Claudio Atzori | 523a6bfa97 | |
Sandro La Bruzzo | 7edcc87ed4 | |
Sandro La Bruzzo | 6a37c7f175 | |
Sandro La Bruzzo | b3f5c2351d | |
Sandro La Bruzzo | f216277219 | |
Andreas Czerniak | 5a9017cf18 | |
Claudio Atzori | aa55dedb8a | |
Claudio Atzori | 29c6f7e255 | |
Sandro La Bruzzo | 17e6f1934e | |
Sandro La Bruzzo | ebcc3ec14f | |
miconis | 4b2124a18e | |
Enrico Ottonello | ee4ba7298b | |
Claudio Atzori | bae029f828 | |
Claudio Atzori | bebc54d5bf | |
Claudio Atzori | 50add4c61b | |
Claudio Atzori | 40df0f987d | |
Claudio Atzori | a8a758925e | |
Michele Artini | 2ee0c3e47e | |
Claudio Atzori | 730973679a | |
Claudio Atzori | deb85706db | |
Sandro La Bruzzo | 4dae5e605d | |
Claudio Atzori | 72c57b28fa | |
Claudio Atzori | 40764cf626 | |
Enrico Ottonello | c238561001 | |
Enrico Ottonello | 465ce39f75 | |
Sandro La Bruzzo | 69c253710b | |
Michele Artini | 3ea8c328ac | |
Michele Artini | 26d2eb946f | |
Claudio Atzori | 4758b58aa2 | |
Claudio Atzori | e04045089f | |
Michele Artini | 1b9731632b | |
Michele Artini | 820d729e99 | |
Michele Artini | 33f4696d6e | |
Michele Artini | c286d28ad2 | |
Claudio Atzori | 0e8a4f9f1a | |
Claudio Atzori | 53884d12c2 | |
Claudio Atzori | ac46c247d2 | |
Claudio Atzori | bde14b149a | |
Claudio Atzori | ca4391aa1c | |
Claudio Atzori | bb89b99b24 | |
Claudio Atzori | 75807ea5ae | |
Sandro La Bruzzo | 4ed1e306b6 | |
Sandro La Bruzzo | 0634674add | |
Claudio Atzori | d62ea1490d | |
Claudio Atzori | 73d772a4b4 | |
Claudio Atzori | 8eaa1fd4b4 | |
Sandro La Bruzzo | bead34d11a | |
Sandro La Bruzzo | 6ff234d81b | |
Sandro La Bruzzo | b6b835ef49 | |
Sandro La Bruzzo | e423634cb6 | |
Sandro La Bruzzo | 8ee82576c6 | |
Sandro La Bruzzo | 0276180039 | |
Michele Artini | d942d0c77d | |
Sandro La Bruzzo | 0f8e2ecce6 | |
Sandro La Bruzzo | 99cf3a8ea4 | |
Sandro La Bruzzo | 2da8bf7429 | |
Sandro La Bruzzo | 686e7b507c | |
Sandro La Bruzzo | 98b9498b57 | |
Michele Artini | 38f2508c87 | |
Sandro La Bruzzo | 184e7b3856 | |
Sandro La Bruzzo | 150a617bd1 | |
Sandro La Bruzzo | a54848a59c | |
Sandro La Bruzzo | ffb092b8d3 | |
Sandro La Bruzzo | cda210a2ca | |
miconis | c7e2d5a59a | |
miconis | 8fea29177c | |
miconis | 1e1aab83e3 | |
Enrico Ottonello | b2de598c1a | |
Enrico Ottonello | efe4c2a9c5 | |
Enrico Ottonello | 858efbfad1 | |
Claudio Atzori | d9532446eb | |
Claudio Atzori | 1eaad89a3c | |
Claudio Atzori | 3c10941376 | |
Claudio Atzori | 12e2f930c8 | |
Claudio Atzori | 3c5ce1dada | |
Claudio Atzori | fcd7689b50 | |
Antonis Lempesis | aead9efd24 | |
Antonis Lempesis | 77a3a6d82e | |
Antonis Lempesis | 91226117b3 | |
Antonis Lempesis | b7f29db126 | |
Antonis Lempesis | ded2392275 | |
Antonis Lempesis | 1a87a1effd | |
Enrico Ottonello | 2233750a37 | |
Claudio Atzori | 491ad24750 | |
Enrico Ottonello | 5c65e602d3 | |
Claudio Atzori | 21ddcf3a73 | |
Enrico Ottonello | fa1855a4b8 | |
Enrico Ottonello | b1b589ada1 | |
Enrico Ottonello | 8812ab65e1 | |
Enrico Ottonello | 53b22c1937 | |
Enrico Ottonello | 1b1e9ea67c | |
Claudio Atzori | 943b961cf6 | |
Claudio Atzori | 893ac4a77b | |
Claudio Atzori | 349e7246aa | |
Claudio Atzori | 2c407e775e | |
Claudio Atzori | 758d27745d | |
Claudio Atzori | 596a2a459d | |
Claudio Atzori | fa66e5b6b8 | |
Claudio Atzori | 5151850a19 | |
Claudio Atzori | d0d5525d40 | |
Claudio Atzori | 13eae4b31e | |
Claudio Atzori | 76363a8512 | |
Claudio Atzori | c1b9a4045a | |
Claudio Atzori | 1372a4d1bf | |
Claudio Atzori | e208b03755 | |
Claudio Atzori | dfd6205b95 | |
Claudio Atzori | 36173c13a5 | |
Claudio Atzori | e1a1bb3ee4 | |
Claudio Atzori | 33bae02451 | |
Claudio Atzori | e43ab07af6 | |
Claudio Atzori | c016cc050a | |
Claudio Atzori | fcbb05eb21 | |
Claudio Atzori | 3f34757c63 | |
Claudio Atzori | 9b0fb9e958 | |
Sandro La Bruzzo | 8e1d43aab2 | |
Claudio Atzori | 2d76497488 | |
Claudio Atzori | e5da4ee9b1 | |
Claudio Atzori | ea2a0ea949 | |
Claudio Atzori | 86d6fbe95b | |
Claudio Atzori | 8471888ad3 | |
Claudio Atzori | 3fcd669e99 | |
Claudio Atzori | 78c3c1b62b | |
Claudio Atzori | 8e7f81c5f5 | |
Claudio Atzori | 09e44dabff | |
Claudio Atzori | 385214eeae | |
Claudio Atzori | 04ad8969b2 | |
Claudio Atzori | 4ca75d6951 | |
Claudio Atzori | 58f28296ea | |
miconis | c4a59d1b9a | |
miconis | 708d887e64 | |
miconis | 0e54803177 | |
Claudio Atzori | 266bf1a221 | |
Claudio Atzori | 34f1d0904b | |
Claudio Atzori | c188868450 | |
Claudio Atzori | 3e6c8bca39 | |
miconis | 6f8720982c | |
Claudio Atzori | 8958f20813 | |
Claudio Atzori | 1abcabb6e6 | |
miconis | 1804c5d809 | |
miconis | 7093355487 | |
Claudio Atzori | 642b459552 | |
Claudio Atzori | 6ce340bd3d | |
miconis | a2ac7e52fb | |
miconis | e3f7798d1b | |
miconis | 4cf79f32eb | |
miconis | 259362ef47 | |
miconis | d47352cbc7 | |
miconis | b260fee787 |
|
@ -7,6 +7,8 @@
|
|||
*.iws
|
||||
*~
|
||||
.vscode
|
||||
.metals
|
||||
.bloop
|
||||
.classpath
|
||||
/*/.classpath
|
||||
/*/*/.classpath
|
||||
|
@ -24,4 +26,5 @@
|
|||
spark-warehouse
|
||||
/**/job-override.properties
|
||||
/**/*.log
|
||||
/**/.factorypath
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-common</artifactId>
|
||||
|
@ -20,6 +21,10 @@
|
|||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.sisyphsu</groupId>
|
||||
<artifactId>dateparser</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
|
@ -53,11 +58,6 @@
|
|||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
|
||||
<dependency>
|
||||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
|
@ -98,6 +98,16 @@
|
|||
<artifactId>dnet-pace-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.application;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
/**
 * Empty class: the visible version declares no fields, constructors or methods.
 * NOTE(review): presumably a placeholder for application-level helpers - confirm before relying on it.
 */
public class ApplicationUtils {
|
||||
|
||||
}
|
|
@ -1,10 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.application;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.Serializable;
|
||||
import java.io.StringWriter;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream;
|
|||
import org.apache.commons.cli.*;
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class ArgumentApplicationParser implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
|
||||
|
||||
private final Options options = new Options();
|
||||
private final Map<String, String> objectMap = new HashMap<>();
|
||||
|
||||
private final List<String> compressedValues = new ArrayList<>();
|
||||
|
||||
public ArgumentApplicationParser(final String json_configuration) throws Exception {
|
||||
public ArgumentApplicationParser(final String json_configuration) throws IOException {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
|
||||
createOptionMap(configuration);
|
||||
|
@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable {
|
|||
}
|
||||
|
||||
private void createOptionMap(final OptionsParameter[] configuration) {
|
||||
|
||||
Arrays
|
||||
.stream(configuration)
|
||||
.map(
|
||||
|
@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable {
|
|||
return o;
|
||||
})
|
||||
.forEach(options::addOption);
|
||||
|
||||
// HelpFormatter formatter = new HelpFormatter();
|
||||
// formatter.printHelp("myapp", null, options, null, true);
|
||||
|
||||
}
|
||||
|
||||
public static String decompressValue(final String abstractCompressed) {
|
||||
|
@ -61,7 +57,7 @@ public class ArgumentApplicationParser implements Serializable {
|
|||
IOUtils.copy(gis, stringWriter);
|
||||
return stringWriter.toString();
|
||||
} catch (Throwable e) {
|
||||
System.out.println("Wrong value to decompress:" + abstractCompressed);
|
||||
log.error("Wrong value to decompress:" + abstractCompressed);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable {
|
|||
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
|
||||
}
|
||||
|
||||
public void parseArgument(final String[] args) throws Exception {
|
||||
public void parseArgument(final String[] args) throws ParseException {
|
||||
CommandLineParser parser = new BasicParser();
|
||||
CommandLine cmd = parser.parse(options, args);
|
||||
Arrays
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.collector.worker.model;
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
|
@ -27,4 +27,26 @@ public class Constants {
|
|||
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
|
||||
}
|
||||
|
||||
public static final String SEQUENCE_FILE_NAME = "/sequence_file";
|
||||
public static final String REPORT_FILE_NAME = "/report";
|
||||
public static final String MDSTORE_DATA_PATH = "/store";
|
||||
public static final String MDSTORE_SIZE_PATH = "/size";
|
||||
|
||||
public static final String COLLECTION_MODE = "collectionMode";
|
||||
public static final String METADATA_ENCODING = "metadataEncoding";
|
||||
public static final String OOZIE_WF_PATH = "oozieWfPath";
|
||||
public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
|
||||
|
||||
public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
|
||||
public static final String REQUEST_DELAY = "requestDelay";
|
||||
public static final String RETRY_DELAY = "retryDelay";
|
||||
public static final String CONNECT_TIMEOUT = "connectTimeOut";
|
||||
public static final String READ_TIMEOUT = "readTimeOut";
|
||||
public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
|
||||
public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
|
||||
|
||||
public static final String CONTENT_TOTALITEMS = "TotalItems";
|
||||
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
|
||||
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
|
||||
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ public class DbClient implements Closeable {
|
|||
|
||||
private static final Log log = LogFactory.getLog(DbClient.class);
|
||||
|
||||
private Connection connection;
|
||||
private final Connection connection;
|
||||
|
||||
public DbClient(final String address, final String login, final String password) {
|
||||
|
||||
|
|
|
@ -100,7 +100,7 @@ public class MakeTarArchive implements Serializable {
|
|||
BufferedInputStream bis = new BufferedInputStream(is);
|
||||
|
||||
int count;
|
||||
byte data[] = new byte[1024];
|
||||
byte[] data = new byte[1024];
|
||||
while ((count = bis.read(data, 0, data.length)) != -1) {
|
||||
ar.write(data, 0, count);
|
||||
}
|
||||
|
|
|
@ -1,39 +1,60 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.bson.Document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.mongodb.BasicDBObject;
|
||||
import com.mongodb.MongoClient;
|
||||
import com.mongodb.MongoClientURI;
|
||||
import com.mongodb.QueryBuilder;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
|
||||
public class MdstoreClient implements Closeable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class);
|
||||
|
||||
private final MongoClient client;
|
||||
private final MongoDatabase db;
|
||||
|
||||
private static final String COLL_METADATA = "metadata";
|
||||
private static final String COLL_METADATA_MANAGER = "metadataManager";
|
||||
|
||||
private static final Log log = LogFactory.getLog(MdstoreClient.class);
|
||||
|
||||
public MdstoreClient(final String baseUrl, final String dbName) {
|
||||
this.client = new MongoClient(new MongoClientURI(baseUrl));
|
||||
this.db = getDb(client, dbName);
|
||||
}
|
||||
|
||||
public MongoCollection<Document> mdStore(final String mdId) {
|
||||
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
|
||||
|
||||
log.info("querying current mdId: {}", query.toJson());
|
||||
|
||||
final String currentId = Optional
|
||||
.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
|
||||
.map(r -> r.first())
|
||||
.map(d -> d.getString("currentId"))
|
||||
.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
|
||||
|
||||
log.info("currentId: {}", currentId);
|
||||
|
||||
return getColl(db, currentId, true);
|
||||
}
|
||||
|
||||
public Map<String, String> validCollections(
|
||||
final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
||||
|
|
@ -13,9 +13,9 @@ import okio.Source;
|
|||
|
||||
public class InputStreamRequestBody extends RequestBody {
|
||||
|
||||
private InputStream inputStream;
|
||||
private MediaType mediaType;
|
||||
private long lenght;
|
||||
private final InputStream inputStream;
|
||||
private final MediaType mediaType;
|
||||
private final long lenght;
|
||||
|
||||
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
|
||||
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.rest;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.client.methods.HttpUriRequest;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class DNetRestClient {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
|
||||
|
||||
private static final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
|
||||
final HttpGet httpGet = new HttpGet(url);
|
||||
return doHTTPRequest(httpGet, clazz);
|
||||
}
|
||||
|
||||
public static String doGET(final String url) throws Exception {
|
||||
final HttpGet httpGet = new HttpGet(url);
|
||||
return doHTTPRequest(httpGet);
|
||||
}
|
||||
|
||||
public static <V> String doPOST(final String url, V objParam) throws Exception {
|
||||
final HttpPost httpPost = new HttpPost(url);
|
||||
|
||||
if (objParam != null) {
|
||||
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
|
||||
httpPost.setEntity(entity);
|
||||
httpPost.setHeader("Accept", "application/json");
|
||||
httpPost.setHeader("Content-type", "application/json");
|
||||
}
|
||||
return doHTTPRequest(httpPost);
|
||||
}
|
||||
|
||||
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
|
||||
return mapper.readValue(doPOST(url, objParam), clazz);
|
||||
}
|
||||
|
||||
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
|
||||
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
|
||||
log
|
||||
.info(
|
||||
"request headers: {}",
|
||||
Arrays
|
||||
.asList(r.getAllHeaders())
|
||||
.stream()
|
||||
.map(h -> h.getName() + ":" + h.getValue())
|
||||
.collect(Collectors.joining(",")));
|
||||
|
||||
CloseableHttpResponse response = client.execute(r);
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
}
|
||||
|
||||
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
|
||||
return mapper.readValue(doHTTPRequest(r), clazz);
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
package eu.dnetlib.dhp.common.vocabulary;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
|
@ -10,8 +10,8 @@ import org.apache.commons.lang3.StringUtils;
|
|||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class Vocabulary implements Serializable {
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
package eu.dnetlib.dhp.common.vocabulary;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
@ -7,8 +7,8 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
|
@ -67,6 +67,10 @@ public class VocabularyGroup implements Serializable {
|
|||
|
||||
private final Map<String, Vocabulary> vocs = new HashMap<>();
|
||||
|
||||
public Set<String> vocabularyNames() {
|
||||
return vocs.keySet();
|
||||
}
|
||||
|
||||
public void addVocabulary(final String id, final String name) {
|
||||
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
|
||||
}
|
||||
|
@ -118,7 +122,31 @@ public class VocabularyGroup implements Serializable {
|
|||
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
|
||||
}
|
||||
|
||||
/**
|
||||
* getSynonymAsQualifierCaseSensitive
|
||||
*
|
||||
* reflects the situation to check caseSensitive vocabulary
|
||||
*/
|
||||
public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) {
|
||||
if (StringUtils.isBlank(vocId)) {
|
||||
return OafMapperUtils.unknown("", "");
|
||||
}
|
||||
return vocs.get(vocId).getSynonymAsQualifier(syn);
|
||||
}
|
||||
|
||||
/**
|
||||
* termExists
|
||||
*
|
||||
* two methods: without and with caseSensitive check
|
||||
*/
|
||||
public boolean termExists(final String vocId, final String id) {
|
||||
return termExists(vocId, id, Boolean.FALSE);
|
||||
}
|
||||
|
||||
public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) {
|
||||
if (Boolean.TRUE.equals(caseSensitive)) {
|
||||
return vocabularyExists(vocId) && vocs.get(vocId).termExists(id);
|
||||
}
|
||||
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
|
||||
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
package eu.dnetlib.dhp.common.vocabulary;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
|
||||
package eu.dnetlib.dhp.message;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class Message implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 401753881204524893L;
|
||||
|
||||
public static String CURRENT_PARAM = "current";
|
||||
public static String TOTAL_PARAM = "total";
|
||||
|
||||
private MessageType messageType;
|
||||
|
||||
private String workflowId;
|
||||
|
||||
private Map<String, String> body;
|
||||
|
||||
public Message() {
|
||||
}
|
||||
|
||||
public Message(final MessageType messageType, final String workflowId) {
|
||||
this(messageType, workflowId, new LinkedHashMap<>());
|
||||
}
|
||||
|
||||
public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
|
||||
this.messageType = messageType;
|
||||
this.workflowId = workflowId;
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
public MessageType getMessageType() {
|
||||
return messageType;
|
||||
}
|
||||
|
||||
public void setMessageType(MessageType messageType) {
|
||||
this.messageType = messageType;
|
||||
}
|
||||
|
||||
public String getWorkflowId() {
|
||||
return workflowId;
|
||||
}
|
||||
|
||||
public void setWorkflowId(final String workflowId) {
|
||||
this.workflowId = workflowId;
|
||||
}
|
||||
|
||||
public Map<String, String> getBody() {
|
||||
return body;
|
||||
}
|
||||
|
||||
public void setBody(final Map<String, String> body) {
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
|
||||
package eu.dnetlib.dhp.message;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
import org.apache.http.client.config.RequestConfig;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPut;
|
||||
import org.apache.http.entity.ContentType;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class MessageSender {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
|
||||
|
||||
private static final int SOCKET_TIMEOUT_MS = 2000;
|
||||
|
||||
private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
|
||||
|
||||
private static final int CONNTECTION_TIMEOUT_MS = 2000;
|
||||
|
||||
private final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private final String dnetMessageEndpoint;
|
||||
|
||||
private final String workflowId;
|
||||
|
||||
private final ExecutorService executorService = Executors.newCachedThreadPool();
|
||||
|
||||
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
|
||||
this.workflowId = workflowId;
|
||||
this.dnetMessageEndpoint = dnetMessageEndpoint;
|
||||
}
|
||||
|
||||
public void sendMessage(final Message message) {
|
||||
executorService.submit(() -> _sendMessage(message));
|
||||
}
|
||||
|
||||
public void sendMessage(final Long current, final Long total) {
|
||||
sendMessage(createOngoingMessage(current, total));
|
||||
}
|
||||
|
||||
public void sendReport(final Map<String, String> report) {
|
||||
sendMessage(new Message(MessageType.REPORT, workflowId, report));
|
||||
}
|
||||
|
||||
private Message createOngoingMessage(final Long current, final Long total) {
|
||||
final Message m = new Message(MessageType.ONGOING, workflowId);
|
||||
m.getBody().put(Message.CURRENT_PARAM, current.toString());
|
||||
if (total != null) {
|
||||
m.getBody().put(Message.TOTAL_PARAM, total.toString());
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
private void _sendMessage(final Message message) {
|
||||
try {
|
||||
final String json = objectMapper.writeValueAsString(message);
|
||||
|
||||
final HttpPut req = new HttpPut(dnetMessageEndpoint);
|
||||
req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
|
||||
|
||||
final RequestConfig requestConfig = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(CONNTECTION_TIMEOUT_MS)
|
||||
.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
|
||||
.setSocketTimeout(SOCKET_TIMEOUT_MS)
|
||||
.build();
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients
|
||||
.custom()
|
||||
.setDefaultRequestConfig(requestConfig)
|
||||
.build();
|
||||
final CloseableHttpResponse response = client.execute(req)) {
|
||||
log.debug("Sent Message to " + dnetMessageEndpoint);
|
||||
log.debug("MESSAGE:" + message);
|
||||
} catch (final Throwable e) {
|
||||
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
|
||||
}
|
||||
} catch (final JsonProcessingException e) {
|
||||
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
package eu.dnetlib.dhp.message;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/** The kinds of message exchanged with D-Net: progress updates ({@link #ONGOING}) and reports ({@link #REPORT}). */
public enum MessageType implements Serializable {

	ONGOING, REPORT;

	/**
	 * Resolves a message type from its name, ignoring the case of the letters.
	 * <p>
	 * The name is upper-cased with {@link java.util.Locale#ROOT} so that the lookup does not depend on the
	 * default locale of the JVM (e.g. under a Turkish locale "ongoing" would otherwise become "ONGOİNG").
	 *
	 * @param value the name of the message type
	 * @return the matching message type
	 * @throws IllegalArgumentException when {@code value} is {@code null} or does not name a message type
	 */
	public MessageType from(String value) {
		if (value == null) {
			throw new IllegalArgumentException("unknown message type: " + value);
		}
		try {
			return MessageType.valueOf(value.toUpperCase(java.util.Locale.ROOT));
		} catch (IllegalArgumentException e) {
			// same exception type as before, but with a message consistent with the null case
			throw new IllegalArgumentException("unknown message type: " + value, e);
		}
	}

}
|
|
@ -1,121 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.model.mdstore;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
/** This class models a record inside the new Metadata store collection on HDFS * */
|
||||
public class MetadataRecord implements Serializable {
|
||||
|
||||
/** The D-Net Identifier associated to the record */
|
||||
private String id;
|
||||
|
||||
/** The original Identifier of the record */
|
||||
private String originalId;
|
||||
|
||||
/** The encoding of the record, should be JSON or XML */
|
||||
private String encoding;
|
||||
|
||||
/**
|
||||
* The information about the provenance of the record see @{@link Provenance} for the model of this information
|
||||
*/
|
||||
private Provenance provenance;
|
||||
|
||||
/** The content of the metadata */
|
||||
private String body;
|
||||
|
||||
/** the date when the record has been stored */
|
||||
private long dateOfCollection;
|
||||
|
||||
/** the date when the record has been stored */
|
||||
private long dateOfTransformation;
|
||||
|
||||
public MetadataRecord() {
|
||||
this.dateOfCollection = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
public MetadataRecord(
|
||||
String originalId,
|
||||
String encoding,
|
||||
Provenance provenance,
|
||||
String body,
|
||||
long dateOfCollection) {
|
||||
|
||||
this.originalId = originalId;
|
||||
this.encoding = encoding;
|
||||
this.provenance = provenance;
|
||||
this.body = body;
|
||||
this.dateOfCollection = dateOfCollection;
|
||||
this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getOriginalId() {
|
||||
return originalId;
|
||||
}
|
||||
|
||||
public void setOriginalId(String originalId) {
|
||||
this.originalId = originalId;
|
||||
}
|
||||
|
||||
public String getEncoding() {
|
||||
return encoding;
|
||||
}
|
||||
|
||||
public void setEncoding(String encoding) {
|
||||
this.encoding = encoding;
|
||||
}
|
||||
|
||||
public Provenance getProvenance() {
|
||||
return provenance;
|
||||
}
|
||||
|
||||
public void setProvenance(Provenance provenance) {
|
||||
this.provenance = provenance;
|
||||
}
|
||||
|
||||
public String getBody() {
|
||||
return body;
|
||||
}
|
||||
|
||||
public void setBody(String body) {
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
public long getDateOfCollection() {
|
||||
return dateOfCollection;
|
||||
}
|
||||
|
||||
public void setDateOfCollection(long dateOfCollection) {
|
||||
this.dateOfCollection = dateOfCollection;
|
||||
}
|
||||
|
||||
public long getDateOfTransformation() {
|
||||
return dateOfTransformation;
|
||||
}
|
||||
|
||||
public void setDateOfTransformation(long dateOfTransformation) {
|
||||
this.dateOfTransformation = dateOfTransformation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof MetadataRecord)) {
|
||||
return false;
|
||||
}
|
||||
return ((MetadataRecord) o).getId().equalsIgnoreCase(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return id.hashCode();
|
||||
}
|
||||
}
|
|
@ -1,52 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.model.mdstore;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Models the provenance of a record in the metadata store: the identifier and the name of the datasource that
 * gives the record, plus a namespace prefix.
 *
 * @author Sandro La Bruzzo
 */
public class Provenance implements Serializable {

	private String datasourceId;

	private String datasourceName;

	private String nsPrefix;

	/** Creates a provenance with every attribute unset. */
	public Provenance() {
	}

	/**
	 * @param datasourceId the identifier of the datasource
	 * @param datasourceName the name of the datasource
	 * @param nsPrefix the namespace prefix
	 */
	public Provenance(final String datasourceId, final String datasourceName, final String nsPrefix) {
		setDatasourceId(datasourceId);
		setDatasourceName(datasourceName);
		setNsPrefix(nsPrefix);
	}

	public String getDatasourceId() {
		return this.datasourceId;
	}

	public void setDatasourceId(final String datasourceId) {
		this.datasourceId = datasourceId;
	}

	public String getDatasourceName() {
		return this.datasourceName;
	}

	public void setDatasourceName(final String datasourceName) {
		this.datasourceName = datasourceName;
	}

	public String getNsPrefix() {
		return this.nsPrefix;
	}

	public void setNsPrefix(final String nsPrefix) {
		this.nsPrefix = nsPrefix;
	}
}
|
|
@ -1,49 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
||||
public class ResultTypeComparator implements Comparator<Result> {
|
||||
|
||||
@Override
|
||||
public int compare(Result left, Result right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
String lClass = left.getResulttype().getClassid();
|
||||
String rClass = right.getResulttype().getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
}
|
|
@ -1,33 +1,37 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.format.DateTimeParseException;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import com.clearspring.analytics.util.Lists;
|
||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_PREFIX_REGEX = "^10\\.";
|
||||
public class GraphCleaningFunctions extends CleaningFunctions {
|
||||
|
||||
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
||||
public static final int ORCID_LEN = 19;
|
||||
|
||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||
|
||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||
|
||||
static {
|
||||
PID_BLACKLIST.add("none");
|
||||
PID_BLACKLIST.add("na");
|
||||
}
|
||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||
// Removes the word "test", non-word characters and digits; what is left of a (lower-cased) title must be longer
// than TITLE_FILTER_RESIDUAL_LENGTH for the title to be kept. The previous value wrapped everything in square
// brackets, i.e. a character class that also stripped every single 't', 'e' and 's'.
// NOTE(review): \W is ASCII-only by default, so letters outside [a-zA-Z] are stripped as well - confirm intended.
public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d";
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
|
@ -59,23 +63,17 @@ public class CleaningFunctions {
|
|||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
r
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(a -> {
|
||||
if (Objects.nonNull(a.getPid())) {
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
|
||||
}
|
||||
});
|
||||
r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
|
||||
if (Objects.nonNull(a.getPid())) {
|
||||
a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
|
||||
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||
} else if (value instanceof Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
|
@ -87,7 +85,37 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T fixDefaults(T value) {
|
||||
public static <T extends Oaf> boolean filter(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Organization) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
|
||||
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T cleanup(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -98,10 +126,44 @@ public class CleaningFunctions {
|
|||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
Relation r = (Relation) value;
|
||||
|
||||
Optional<String> validationDate = doCleanDate(r.getValidationDate());
|
||||
if (validationDate.isPresent()) {
|
||||
r.setValidationDate(validationDate.get());
|
||||
r.setValidated(true);
|
||||
} else {
|
||||
r.setValidationDate(null);
|
||||
r.setValidated(false);
|
||||
}
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
|
||||
if (Objects.nonNull(r.getDateofacceptance())) {
|
||||
Optional<String> date = cleanDateField(r.getDateofacceptance());
|
||||
if (date.isPresent()) {
|
||||
r.getDateofacceptance().setValue(date.get());
|
||||
} else {
|
||||
r.setDateofacceptance(null);
|
||||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getRelevantdate())) {
|
||||
r
|
||||
.setRelevantdate(
|
||||
r
|
||||
.getRelevantdate()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(sp -> {
|
||||
sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
|
||||
return sp;
|
||||
})
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
|
||||
r.setPublisher(null);
|
||||
}
|
||||
|
@ -110,16 +172,6 @@ public class CleaningFunctions {
|
|||
.setLanguage(
|
||||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getCountry())) {
|
||||
r
|
||||
.setCountry(
|
||||
r
|
||||
.getCountry()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(c -> StringUtils.isNotBlank(c.getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
|
@ -130,7 +182,7 @@ public class CleaningFunctions {
|
|||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(CleaningFunctions::cleanValue)
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getTitle())) {
|
||||
|
@ -141,7 +193,13 @@ public class CleaningFunctions {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(CleaningFunctions::cleanValue)
|
||||
.filter(
|
||||
sp -> sp
|
||||
.getValue()
|
||||
.toLowerCase()
|
||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getDescription())) {
|
||||
|
@ -152,22 +210,11 @@ public class CleaningFunctions {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(CleaningFunctions::cleanValue)
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getPid())) {
|
||||
r
|
||||
.setPid(
|
||||
r
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
||||
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.collect(Collectors.toList()));
|
||||
r.setPid(processPidCleaning(r.getPid()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
r
|
||||
|
@ -175,11 +222,36 @@ public class CleaningFunctions {
|
|||
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||
}
|
||||
if (Objects.nonNull(r.getInstance())) {
|
||||
|
||||
for (Instance i : r.getInstance()) {
|
||||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.ifPresent(pid -> {
|
||||
final Set<StructuredProperty> pids = pid
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.ifPresent(altId -> {
|
||||
final Set<StructuredProperty> altIds = altId
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
|
||||
});
|
||||
});
|
||||
|
||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
i
|
||||
.setAccessright(
|
||||
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
accessRight(
|
||||
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES));
|
||||
}
|
||||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
||||
|
@ -187,14 +259,24 @@ public class CleaningFunctions {
|
|||
if (Objects.isNull(i.getRefereed())) {
|
||||
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
||||
}
|
||||
if (Objects.nonNull(i.getDateofacceptance())) {
|
||||
Optional<String> date = cleanDateField(i.getDateofacceptance());
|
||||
if (date.isPresent()) {
|
||||
i.getDateofacceptance().setValue(date.get());
|
||||
} else {
|
||||
i.setDateofacceptance(null);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
|
||||
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
|
||||
if (Objects.isNull(bestaccessrights)) {
|
||||
r
|
||||
.setBestaccessright(
|
||||
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
qualifier(
|
||||
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES));
|
||||
} else {
|
||||
r.setBestaccessright(bestaccessrights);
|
||||
}
|
||||
|
@ -280,11 +362,10 @@ public class CleaningFunctions {
|
|||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||
} else if (value instanceof Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
|
@ -296,6 +377,79 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
|
||||
return Optional
|
||||
.ofNullable(dateofacceptance)
|
||||
.map(Field::getValue)
|
||||
.map(GraphCleaningFunctions::cleanDate)
|
||||
.filter(Objects::nonNull);
|
||||
}
|
||||
|
||||
protected static Optional<String> doCleanDate(String date) {
|
||||
return Optional.ofNullable(cleanDate(date));
|
||||
}
|
||||
|
||||
public static String cleanDate(final String inputDate) {
|
||||
|
||||
if (StringUtils.isBlank(inputDate)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
final LocalDate date = DateParserUtils
|
||||
.parseDate(inputDate.trim())
|
||||
.toInstant()
|
||||
.atZone(ZoneId.systemDefault())
|
||||
.toLocalDate();
|
||||
return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
|
||||
} catch (DateTimeParseException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private static boolean isValidAuthorName(Author a) {
|
||||
return !Stream
|
||||
.of(a.getFullname(), a.getName(), a.getSurname())
|
||||
.filter(s -> s != null && !s.isEmpty())
|
||||
.collect(Collectors.joining(""))
|
||||
.toLowerCase()
|
||||
.matches(INVALID_AUTHOR_REGEX);
|
||||
}
|
||||
|
||||
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
|
||||
return pids
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
||||
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
||||
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
|
||||
q.setSchemeid(vocabularyName);
|
||||
q.setSchemename(vocabularyName);
|
||||
}
|
||||
}
|
||||
|
||||
private static AccessRight accessRight(String classid, String classname, String scheme) {
|
||||
return OafMapperUtils
|
||||
.accessRight(
|
||||
classid, classname, scheme, scheme);
|
||||
}
|
||||
|
||||
private static Qualifier qualifier(String classid, String classname, String scheme) {
|
||||
return OafMapperUtils
|
||||
.qualifier(
|
||||
classid, classname, scheme, scheme);
|
||||
}
|
||||
|
||||
protected static StructuredProperty cleanValue(StructuredProperty s) {
|
||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||
return s;
|
||||
|
@ -306,39 +460,4 @@ public class CleaningFunctions {
|
|||
return s;
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
||||
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
|
||||
q.setSchemeid(vocabularyName);
|
||||
q.setSchemename(vocabularyName);
|
||||
}
|
||||
}
|
||||
|
||||
private static Qualifier qualifier(String classid, String classname, String scheme) {
|
||||
return OafMapperUtils
|
||||
.qualifier(
|
||||
classid, classname, scheme, scheme);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that normalises PID values on a per-type basis.
|
||||
* @param pid the PID whose value will be normalised.
|
||||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
String value = Optional
|
||||
.ofNullable(pid.getValue())
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
switch (pid.getQualifier().getClassid()) {
|
||||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
|
||||
break;
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,11 +1,9 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
|
@ -13,42 +11,45 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class OafMapperUtils {
|
||||
|
||||
public static Oaf merge(final Oaf o1, final Oaf o2) {
|
||||
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
|
||||
if (ModelSupport.isSubClass(o1, Result.class)) {
|
||||
|
||||
return mergeResults((Result) o1, (Result) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
|
||||
((Datasource) o1).mergeFrom((Datasource) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
|
||||
((Organization) o1).mergeFrom((Organization) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Project.class)) {
|
||||
((Project) o1).mergeFrom((Project) o2);
|
||||
} else {
|
||||
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
|
||||
}
|
||||
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
|
||||
((Relation) o1).mergeFrom((Relation) o2);
|
||||
public static Oaf merge(final Oaf left, final Oaf right) {
|
||||
if (ModelSupport.isSubClass(left, OafEntity.class)) {
|
||||
return mergeEntities((OafEntity) left, (OafEntity) right);
|
||||
} else if (ModelSupport.isSubClass(left, Relation.class)) {
|
||||
((Relation) left).mergeFrom((Relation) right);
|
||||
} else {
|
||||
throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
|
||||
throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
|
||||
}
|
||||
return o1;
|
||||
return left;
|
||||
}
|
||||
|
||||
public static Result mergeResults(Result r1, Result r2) {
|
||||
if (new ResultTypeComparator().compare(r1, r2) < 0) {
|
||||
r1.mergeFrom(r2);
|
||||
return r1;
|
||||
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
|
||||
if (ModelSupport.isSubClass(left, Result.class)) {
|
||||
return mergeResults((Result) left, (Result) right);
|
||||
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
|
||||
left.mergeFrom(right);
|
||||
} else if (ModelSupport.isSubClass(left, Organization.class)) {
|
||||
left.mergeFrom(right);
|
||||
} else if (ModelSupport.isSubClass(left, Project.class)) {
|
||||
left.mergeFrom(right);
|
||||
} else {
|
||||
r2.mergeFrom(r1);
|
||||
return r2;
|
||||
throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
||||
public static Result mergeResults(Result left, Result right) {
|
||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||
left.mergeFrom(right);
|
||||
return left;
|
||||
} else {
|
||||
right.mergeFrom(left);
|
||||
return right;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -104,6 +105,29 @@ public class OafMapperUtils {
|
|||
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
|
||||
}
|
||||
|
||||
public static AccessRight accessRight(
|
||||
final String classid,
|
||||
final String classname,
|
||||
final String schemeid,
|
||||
final String schemename) {
|
||||
return accessRight(classid, classname, schemeid, schemename, null);
|
||||
}
|
||||
|
||||
public static AccessRight accessRight(
|
||||
final String classid,
|
||||
final String classname,
|
||||
final String schemeid,
|
||||
final String schemename,
|
||||
final OpenAccessRoute openAccessRoute) {
|
||||
final AccessRight accessRight = new AccessRight();
|
||||
accessRight.setClassid(classid);
|
||||
accessRight.setClassname(classname);
|
||||
accessRight.setSchemeid(schemeid);
|
||||
accessRight.setSchemename(schemename);
|
||||
accessRight.setOpenAccessRoute(openAccessRoute);
|
||||
return accessRight;
|
||||
}
|
||||
|
||||
public static Qualifier qualifier(
|
||||
final String classid,
|
||||
final String classname,
|
||||
|
@ -117,6 +141,15 @@ public class OafMapperUtils {
|
|||
return q;
|
||||
}
|
||||
|
||||
public static Qualifier qualifier(final Qualifier qualifier) {
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(qualifier.getClassid());
|
||||
q.setClassname(qualifier.getClassname());
|
||||
q.setSchemeid(qualifier.getSchemeid());
|
||||
q.setSchemename(qualifier.getSchemename());
|
||||
return q;
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(
|
||||
final String value,
|
||||
final String classid,
|
||||
|
@ -267,7 +300,7 @@ public class OafMapperUtils {
|
|||
} else if (to_md5) {
|
||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
||||
return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
|
||||
} else {
|
||||
return String.format("%s|%s", prefix, originalId);
|
||||
}
|
||||
|
@ -300,4 +333,36 @@ public class OafMapperUtils {
|
|||
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
|
||||
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
|
||||
}
|
||||
|
||||
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
||||
return getBestAccessRights(instanceList);
|
||||
}
|
||||
|
||||
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
||||
if (instanceList != null) {
|
||||
final Optional<AccessRight> min = instanceList
|
||||
.stream()
|
||||
.map(i -> i.getAccessright())
|
||||
.min(new AccessRightComparator<>());
|
||||
|
||||
final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
|
||||
|
||||
if (StringUtils.isBlank(rights.getClassid())) {
|
||||
rights.setClassid(UNKNOWN);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getClassname())
|
||||
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||
rights.setClassname(NOT_AVAILABLE);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemename())) {
|
||||
rights.setSchemename(DNET_ACCESS_MODES);
|
||||
}
|
||||
|
||||
return rights;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -1,18 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.utils;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.codec.binary.Base64OutputStream;
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import net.minidev.json.JSONArray;
|
||||
|
@ -21,6 +32,8 @@ import scala.collection.Seq;
|
|||
|
||||
public class DHPUtils {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
|
||||
|
||||
public static Seq<String> toSeq(List<String> list) {
|
||||
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
|
||||
}
|
||||
|
@ -79,4 +92,72 @@ public class DHPUtils {
|
|||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
public static void writeHdfsFile(final Configuration conf, final String content, final String path)
|
||||
throws IOException {
|
||||
|
||||
log.info("writing file {}, size {}", path, content.length());
|
||||
try (FileSystem fs = FileSystem.get(conf);
|
||||
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
|
||||
os.write(content.getBytes(StandardCharsets.UTF_8));
|
||||
os.flush();
|
||||
}
|
||||
}
|
||||
|
||||
public static String readHdfsFile(Configuration conf, String path) throws IOException {
|
||||
log.info("reading file {}", path);
|
||||
|
||||
try (FileSystem fs = FileSystem.get(conf)) {
|
||||
final Path p = new Path(path);
|
||||
if (!fs.exists(p)) {
|
||||
throw new FileNotFoundException(path);
|
||||
}
|
||||
return IOUtils.toString(fs.open(p));
|
||||
}
|
||||
}
|
||||
|
||||
public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
|
||||
return MAPPER.readValue(readHdfsFile(conf, path), clazz);
|
||||
}
|
||||
|
||||
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
|
||||
log.info("saving dataset in: {}", targetPath);
|
||||
mdstore
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.format("parquet")
|
||||
.save(targetPath);
|
||||
}
|
||||
|
||||
public static Configuration getHadoopConfiguration(String nameNode) {
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", nameNode);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
return conf;
|
||||
}
|
||||
|
||||
public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
|
||||
File file = new File(System.getProperty("oozie.action.output.properties"));
|
||||
Properties props = new Properties();
|
||||
report.forEach((k, v) -> props.setProperty(k, v));
|
||||
|
||||
try (OutputStream os = new FileOutputStream(file)) {
|
||||
props.store(os, "");
|
||||
}
|
||||
}
|
||||
|
||||
public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
|
||||
Map<String, String> report = Maps.newHashMap();
|
||||
report.put(paramName, value);
|
||||
|
||||
populateOOZIEEnv(report);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,8 +15,8 @@ public class ISLookupClientFactory {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
|
||||
|
||||
private static int requestTimeout = 60000 * 10;
|
||||
private static int connectTimeout = 60000 * 10;
|
||||
private static final int requestTimeout = 60000 * 10;
|
||||
private static final int connectTimeout = 60000 * 10;
|
||||
|
||||
public static ISLookUpService getLookUpService(final String isLookupUrl) {
|
||||
return getServiceStub(ISLookUpService.class, isLookupUrl);
|
||||
|
|
|
@ -1,76 +0,0 @@
|
|||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class Message {
|
||||
|
||||
private String workflowId;
|
||||
|
||||
private String jobName;
|
||||
|
||||
private MessageType type;
|
||||
|
||||
private Map<String, String> body;
|
||||
|
||||
public static Message fromJson(final String json) throws IOException {
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
return jsonMapper.readValue(json, Message.class);
|
||||
}
|
||||
|
||||
public Message() {
|
||||
}
|
||||
|
||||
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
|
||||
this.workflowId = workflowId;
|
||||
this.jobName = jobName;
|
||||
this.type = type;
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
public String getWorkflowId() {
|
||||
return workflowId;
|
||||
}
|
||||
|
||||
public void setWorkflowId(String workflowId) {
|
||||
this.workflowId = workflowId;
|
||||
}
|
||||
|
||||
public String getJobName() {
|
||||
return jobName;
|
||||
}
|
||||
|
||||
public void setJobName(String jobName) {
|
||||
this.jobName = jobName;
|
||||
}
|
||||
|
||||
public MessageType getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(MessageType type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public Map<String, String> getBody() {
|
||||
return body;
|
||||
}
|
||||
|
||||
public void setBody(Map<String, String> body) {
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
try {
|
||||
return jsonMapper.writeValueAsString(this);
|
||||
} catch (JsonProcessingException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,47 +0,0 @@
|
|||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import com.rabbitmq.client.AMQP;
|
||||
import com.rabbitmq.client.Channel;
|
||||
import com.rabbitmq.client.DefaultConsumer;
|
||||
import com.rabbitmq.client.Envelope;
|
||||
|
||||
public class MessageConsumer extends DefaultConsumer {
|
||||
|
||||
final LinkedBlockingQueue<Message> queueMessages;
|
||||
|
||||
/**
|
||||
* Constructs a new instance and records its association to the passed-in channel.
|
||||
*
|
||||
* @param channel the channel to which this consumer is attached
|
||||
* @param queueMessages
|
||||
*/
|
||||
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
|
||||
super(channel);
|
||||
this.queueMessages = queueMessages;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleDelivery(
|
||||
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
|
||||
throws IOException {
|
||||
final String json = new String(body, StandardCharsets.UTF_8);
|
||||
Message message = Message.fromJson(json);
|
||||
try {
|
||||
this.queueMessages.put(message);
|
||||
System.out.println("Receiving Message " + message);
|
||||
} catch (InterruptedException e) {
|
||||
if (message.getType() == MessageType.REPORT)
|
||||
throw new RuntimeException("Error on sending message");
|
||||
else {
|
||||
// TODO LOGGING EXCEPTION
|
||||
}
|
||||
} finally {
|
||||
getChannel().basicAck(envelope.getDeliveryTag(), false);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,136 +0,0 @@
|
|||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import com.rabbitmq.client.Channel;
|
||||
import com.rabbitmq.client.Connection;
|
||||
import com.rabbitmq.client.ConnectionFactory;
|
||||
|
||||
public class MessageManager {
|
||||
|
||||
private final String messageHost;
|
||||
|
||||
private final String username;
|
||||
|
||||
private final String password;
|
||||
|
||||
private Connection connection;
|
||||
|
||||
private final Map<String, Channel> channels = new HashMap<>();
|
||||
|
||||
private boolean durable;
|
||||
|
||||
private boolean autodelete;
|
||||
|
||||
private final LinkedBlockingQueue<Message> queueMessages;
|
||||
|
||||
public MessageManager(
|
||||
String messageHost,
|
||||
String username,
|
||||
String password,
|
||||
final LinkedBlockingQueue<Message> queueMessages) {
|
||||
this.queueMessages = queueMessages;
|
||||
this.messageHost = messageHost;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
}
|
||||
|
||||
public MessageManager(
|
||||
String messageHost,
|
||||
String username,
|
||||
String password,
|
||||
boolean durable,
|
||||
boolean autodelete,
|
||||
final LinkedBlockingQueue<Message> queueMessages) {
|
||||
this.queueMessages = queueMessages;
|
||||
this.messageHost = messageHost;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
|
||||
this.durable = durable;
|
||||
this.autodelete = autodelete;
|
||||
}
|
||||
|
||||
private Connection createConnection() throws IOException, TimeoutException {
|
||||
ConnectionFactory factory = new ConnectionFactory();
|
||||
factory.setHost(this.messageHost);
|
||||
factory.setUsername(this.username);
|
||||
factory.setPassword(this.password);
|
||||
return factory.newConnection();
|
||||
}
|
||||
|
||||
private Channel createChannel(
|
||||
final Connection connection,
|
||||
final String queueName,
|
||||
final boolean durable,
|
||||
final boolean autodelete)
|
||||
throws Exception {
|
||||
Map<String, Object> args = new HashMap<>();
|
||||
args.put("x-message-ttl", 10000);
|
||||
Channel channel = connection.createChannel();
|
||||
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
|
||||
return channel;
|
||||
}
|
||||
|
||||
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
|
||||
throws Exception {
|
||||
if (channels.containsKey(queueName)) {
|
||||
return channels.get(queueName);
|
||||
}
|
||||
|
||||
if (this.connection == null) {
|
||||
this.connection = createConnection();
|
||||
}
|
||||
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
|
||||
return channels.get(queueName);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
channels
|
||||
.values()
|
||||
.forEach(
|
||||
ch -> {
|
||||
try {
|
||||
ch.close();
|
||||
} catch (Exception e) {
|
||||
// TODO LOG
|
||||
}
|
||||
});
|
||||
|
||||
this.connection.close();
|
||||
}
|
||||
|
||||
public boolean sendMessage(final Message message, String queueName) throws Exception {
|
||||
try {
|
||||
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
|
||||
channel.basicPublish("", queueName, null, message.toString().getBytes());
|
||||
return true;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean sendMessage(
|
||||
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
|
||||
throws Exception {
|
||||
try {
|
||||
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
|
||||
channel.basicPublish("", queueName, null, message.toString().getBytes());
|
||||
return true;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void startConsumingMessage(
|
||||
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
|
||||
|
||||
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
|
||||
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
|
||||
}
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
/** The kinds of {@code Message}: {@link #ONGOING} and {@link #REPORT}. */
public enum MessageType {
	ONGOING,
	REPORT
}
|
File diff suppressed because one or more lines are too long
|
@ -1,16 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.model.mdstore;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class MetadataRecordTest {
|
||||
|
||||
@Test
|
||||
public void getTimestamp() {
|
||||
|
||||
MetadataRecord r = new MetadataRecord();
|
||||
assertTrue(r.getDateOfCollection() > 0);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,180 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class OafMapperUtilsTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
public void testDateValidation() {
|
||||
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
|
||||
|
||||
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
|
||||
|
||||
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
|
||||
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
|
||||
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
|
||||
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
|
||||
assertEquals(
|
||||
"2015-07-03",
|
||||
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
|
||||
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
|
||||
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
|
||||
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
|
||||
assertEquals(
|
||||
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
|
||||
assertEquals(
|
||||
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
|
||||
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
|
||||
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
|
||||
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
|
||||
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
|
||||
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
|
||||
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
|
||||
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
|
||||
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDate() {
|
||||
System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergePubs() throws IOException {
|
||||
Publication p1 = read("publication_1.json", Publication.class);
|
||||
Publication p2 = read("publication_2.json", Publication.class);
|
||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||
Dataset d2 = read("dataset_2.json", Dataset.class);
|
||||
|
||||
assertEquals(p1.getCollectedfrom().size(), 1);
|
||||
assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
|
||||
assertEquals(d2.getCollectedfrom().size(), 1);
|
||||
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
|
||||
assertTrue(
|
||||
OafMapperUtils
|
||||
.mergeResults(p1, d2)
|
||||
.getResulttype()
|
||||
.getClassid()
|
||||
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
|
||||
|
||||
assertEquals(p2.getCollectedfrom().size(), 1);
|
||||
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
assertEquals(d1.getCollectedfrom().size(), 1);
|
||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
|
||||
assertTrue(
|
||||
OafMapperUtils
|
||||
.mergeResults(p2, d1)
|
||||
.getResulttype()
|
||||
.getClassid()
|
||||
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
|
||||
}
|
||||
|
||||
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
|
||||
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||
return OBJECT_MAPPER.readValue(json, clazz);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,51 +0,0 @@
|
|||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class MessageTest {
|
||||
|
||||
@Test
|
||||
public void fromJsonTest() throws IOException {
|
||||
Message m = new Message();
|
||||
m.setWorkflowId("wId");
|
||||
m.setType(MessageType.ONGOING);
|
||||
m.setJobName("Collection");
|
||||
Map<String, String> body = new HashMap<>();
|
||||
body.put("parsedItem", "300");
|
||||
body.put("ExecutionTime", "30s");
|
||||
|
||||
m.setBody(body);
|
||||
System.out.println("m = " + m);
|
||||
Message m1 = Message.fromJson(m.toString());
|
||||
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
|
||||
assertEquals(m1.getType(), m.getType());
|
||||
assertEquals(m1.getJobName(), m.getJobName());
|
||||
|
||||
assertNotNull(m1.getBody());
|
||||
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
|
||||
assertEquals(m1.getJobName(), m.getJobName());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringTest() {
|
||||
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
|
||||
Message m = new Message();
|
||||
m.setWorkflowId("wId");
|
||||
m.setType(MessageType.ONGOING);
|
||||
m.setJobName("Collection");
|
||||
Map<String, String> body = new HashMap<>();
|
||||
body.put("parsedItem", "300");
|
||||
body.put("ExecutionTime", "30s");
|
||||
|
||||
m.setBody(body);
|
||||
|
||||
assertEquals(expectedJson, m.toString());
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}
|
|
@ -51,16 +51,6 @@
|
|||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-data-protos</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-api</artifactId>
|
||||
|
|
|
@ -1,69 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.migration;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||
|
||||
public class LicenseComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals("OPEN SOURCE"))
|
||||
return -1;
|
||||
if (rClass.equals("OPEN SOURCE"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("OPEN"))
|
||||
return -1;
|
||||
if (rClass.equals("OPEN"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("6MONTHS"))
|
||||
return -1;
|
||||
if (rClass.equals("6MONTHS"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("12MONTHS"))
|
||||
return -1;
|
||||
if (rClass.equals("12MONTHS"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("EMBARGO"))
|
||||
return -1;
|
||||
if (rClass.equals("EMBARGO"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("RESTRICTED"))
|
||||
return -1;
|
||||
if (rClass.equals("RESTRICTED"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("CLOSED"))
|
||||
return -1;
|
||||
if (rClass.equals("CLOSED"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("UNKNOWN"))
|
||||
return -1;
|
||||
if (rClass.equals("UNKNOWN"))
|
||||
return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
}
|
|
@ -1,196 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.migration;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.tools.DistCp;
|
||||
import org.apache.hadoop.tools.DistCpOptions;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
public class MigrateActionSet {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class);
|
||||
|
||||
private static final String SEPARATOR = "/";
|
||||
private static final String TARGET_PATHS = "target_paths";
|
||||
private static final String RAWSET_PREFIX = "rawset_";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
MigrateActionSet.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
new MigrateActionSet().run(parser);
|
||||
}
|
||||
|
||||
private void run(ArgumentApplicationParser parser) throws Exception {
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
final String sourceNN = parser.get("sourceNameNode");
|
||||
final String targetNN = parser.get("targetNameNode");
|
||||
final String workDir = parser.get("workingDirectory");
|
||||
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
|
||||
|
||||
final String distcp_memory_mb = parser.get("distcp_memory_mb");
|
||||
final String distcp_task_timeout = parser.get("distcp_task_timeout");
|
||||
|
||||
final String transform_only_s = parser.get("transform_only");
|
||||
|
||||
log.info("transform only param: {}", transform_only_s);
|
||||
|
||||
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
|
||||
|
||||
log.info("transform only: {}", transformOnly);
|
||||
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
|
||||
FileSystem targetFS = FileSystem.get(conf);
|
||||
|
||||
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
|
||||
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
|
||||
FileSystem sourceFS = FileSystem.get(sourceConf);
|
||||
|
||||
Properties props = new Properties();
|
||||
|
||||
List<Path> targetPaths = new ArrayList<>();
|
||||
|
||||
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
|
||||
log
|
||||
.info(
|
||||
"paths to process:\n{}", sourcePaths
|
||||
.stream()
|
||||
.map(p -> p.toString())
|
||||
.collect(Collectors.joining("\n")));
|
||||
|
||||
for (Path source : sourcePaths) {
|
||||
|
||||
if (!sourceFS.exists(source)) {
|
||||
log.warn("skipping unexisting path: {}", source);
|
||||
} else {
|
||||
|
||||
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
|
||||
|
||||
final String rawSet = pathQ.pollLast();
|
||||
log.info("got RAWSET: {}", rawSet);
|
||||
|
||||
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
|
||||
|
||||
final String actionSetDirectory = pathQ.pollLast();
|
||||
|
||||
final Path targetPath = new Path(
|
||||
targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
|
||||
|
||||
log.info("using TARGET PATH: {}", targetPath);
|
||||
|
||||
if (!transformOnly) {
|
||||
if (targetFS.exists(targetPath)) {
|
||||
targetFS.delete(targetPath, true);
|
||||
}
|
||||
runDistcp(
|
||||
distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
|
||||
}
|
||||
|
||||
targetPaths.add(targetPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String targetPathsCsv = targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","));
|
||||
props.setProperty(TARGET_PATHS, targetPathsCsv);
|
||||
File file = new File(System.getProperty("oozie.action.output.properties"));
|
||||
|
||||
try (OutputStream os = new FileOutputStream(file)) {
|
||||
props.store(os, "");
|
||||
}
|
||||
System.out.println(file.getAbsolutePath());
|
||||
}
|
||||
|
||||
private void runDistcp(
|
||||
Integer distcp_num_maps,
|
||||
String distcp_memory_mb,
|
||||
String distcp_task_timeout,
|
||||
Configuration conf,
|
||||
Path source,
|
||||
Path targetPath)
|
||||
throws Exception {
|
||||
|
||||
final DistCpOptions op = new DistCpOptions(source, targetPath);
|
||||
op.setMaxMaps(distcp_num_maps);
|
||||
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
|
||||
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
|
||||
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
|
||||
|
||||
int res = ToolRunner
|
||||
.run(
|
||||
new DistCp(conf, op),
|
||||
new String[] {
|
||||
"-Dmapred.task.timeout=" + distcp_task_timeout,
|
||||
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
|
||||
"-pb",
|
||||
"-m " + distcp_num_maps,
|
||||
source.toString(),
|
||||
targetPath.toString()
|
||||
});
|
||||
|
||||
if (res != 0) {
|
||||
throw new RuntimeException(String.format("distcp exited with code %s", res));
|
||||
}
|
||||
}
|
||||
|
||||
private Configuration getConfiguration(
|
||||
String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
|
||||
final Configuration conf = new Configuration();
|
||||
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
|
||||
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
|
||||
conf.set("dfs.http.client.retry.policy.enabled", "true");
|
||||
conf.set("mapred.task.timeout", distcp_task_timeout);
|
||||
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
|
||||
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
|
||||
return conf;
|
||||
}
|
||||
|
||||
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp)
|
||||
throws ISLookUpException {
|
||||
String XQUERY = "distinct-values(\n"
|
||||
+ "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n"
|
||||
+ "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n"
|
||||
+ "let $setDir := $x//SET/@directory/string()\n"
|
||||
+ "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n"
|
||||
+ "return concat($basePath, '/', $setDir, '/', $rawSet))";
|
||||
|
||||
log.info(String.format("running xquery:\n%s", XQUERY));
|
||||
return isLookUp
|
||||
.quickSearchProfile(XQUERY)
|
||||
.stream()
|
||||
.map(p -> sourceNN + p)
|
||||
.map(Path::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
|
@ -1,710 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.migration;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.googlecode.protobuf.format.JsonFormat;
|
||||
|
||||
import eu.dnetlib.data.proto.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class ProtoConverter implements Serializable {
|
||||
|
||||
public static Oaf convert(OafProtos.Oaf oaf) {
|
||||
try {
|
||||
switch (oaf.getKind()) {
|
||||
case entity:
|
||||
return convertEntity(oaf);
|
||||
case relation:
|
||||
return convertRelation(oaf);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Relation convertRelation(OafProtos.Oaf oaf) {
|
||||
final OafProtos.OafRel r = oaf.getRel();
|
||||
final Relation rel = new Relation();
|
||||
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
|
||||
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
|
||||
rel.setSource(r.getSource());
|
||||
rel.setTarget(r.getTarget());
|
||||
rel.setRelType(r.getRelType().toString());
|
||||
rel.setSubRelType(r.getSubRelType().toString());
|
||||
rel.setRelClass(r.getRelClass());
|
||||
rel
|
||||
.setCollectedfrom(
|
||||
r.getCollectedfromCount() > 0
|
||||
? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList())
|
||||
: null);
|
||||
return rel;
|
||||
}
|
||||
|
||||
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
|
||||
|
||||
switch (oaf.getEntity().getType()) {
|
||||
case result:
|
||||
final Result r = convertResult(oaf);
|
||||
r.setInstance(convertInstances(oaf));
|
||||
r.setExternalReference(convertExternalRefs(oaf));
|
||||
return r;
|
||||
case project:
|
||||
return convertProject(oaf);
|
||||
case datasource:
|
||||
return convertDataSource(oaf);
|
||||
case organization:
|
||||
return convertOrganization(oaf);
|
||||
default:
|
||||
throw new RuntimeException("received unknown type");
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
|
||||
|
||||
final ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getInstanceCount() > 0) {
|
||||
return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList());
|
||||
}
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
||||
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
|
||||
final Instance i = new Instance();
|
||||
i.setAccessright(mapQualifier(ri.getAccessright()));
|
||||
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
|
||||
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
|
||||
i.setDistributionlocation(ri.getDistributionlocation());
|
||||
i.setHostedby(mapKV(ri.getHostedby()));
|
||||
i.setInstancetype(mapQualifier(ri.getInstancetype()));
|
||||
i.setLicense(mapStringField(ri.getLicense()));
|
||||
i
|
||||
.setUrl(
|
||||
ri.getUrlList() != null ? ri
|
||||
.getUrlList()
|
||||
.stream()
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)) : null);
|
||||
i.setRefereed(mapRefereed(ri.getRefereed()));
|
||||
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
|
||||
return i;
|
||||
}
|
||||
|
||||
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassid(refereed.getValue());
|
||||
q.setSchemename(refereed.getValue());
|
||||
q.setSchemeid("dnet:review_levels");
|
||||
q.setSchemename("dnet:review_levels");
|
||||
return q;
|
||||
}
|
||||
|
||||
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getExternalReferenceCount() > 0) {
|
||||
return r
|
||||
.getExternalReferenceList()
|
||||
.stream()
|
||||
.map(e -> convertExtRef(e))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
||||
private static ExternalReference convertExtRef(ResultProtos.Result.ExternalReference e) {
|
||||
ExternalReference ex = new ExternalReference();
|
||||
ex.setUrl(e.getUrl());
|
||||
ex.setSitename(e.getSitename());
|
||||
ex.setRefidentifier(e.getRefidentifier());
|
||||
ex.setQuery(e.getQuery());
|
||||
ex.setQualifier(mapQualifier(e.getQualifier()));
|
||||
ex.setLabel(e.getLabel());
|
||||
ex.setDescription(e.getDescription());
|
||||
ex.setDataInfo(ex.getDataInfo());
|
||||
return ex;
|
||||
}
|
||||
|
||||
private static Organization convertOrganization(OafProtos.Oaf oaf) {
|
||||
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
|
||||
final Organization org = setOaf(new Organization(), oaf);
|
||||
setEntity(org, oaf);
|
||||
org.setLegalshortname(mapStringField(m.getLegalshortname()));
|
||||
org.setLegalname(mapStringField(m.getLegalname()));
|
||||
org
|
||||
.setAlternativeNames(
|
||||
m
|
||||
.getAlternativeNamesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
org.setLogourl(mapStringField(m.getLogourl()));
|
||||
org.setEclegalbody(mapStringField(m.getEclegalbody()));
|
||||
org.setEclegalperson(mapStringField(m.getEclegalperson()));
|
||||
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
|
||||
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
|
||||
org.setEchighereducation(mapStringField(m.getEchighereducation()));
|
||||
org
|
||||
.setEcinternationalorganizationeurinterests(
|
||||
mapStringField(m.getEcinternationalorganizationeurinterests()));
|
||||
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
|
||||
org.setEcenterprise(mapStringField(m.getEcenterprise()));
|
||||
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
|
||||
org.setEcnutscode(mapStringField(m.getEcnutscode()));
|
||||
org.setCountry(mapQualifier(m.getCountry()));
|
||||
|
||||
return org;
|
||||
}
|
||||
|
||||
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
|
||||
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
|
||||
final Datasource datasource = setOaf(new Datasource(), oaf);
|
||||
setEntity(datasource, oaf);
|
||||
datasource
|
||||
.setAccessinfopackage(
|
||||
m
|
||||
.getAccessinfopackageList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setCertificates(mapStringField(m.getCertificates()));
|
||||
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
|
||||
datasource.setContactemail(mapStringField(m.getContactemail()));
|
||||
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
|
||||
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
|
||||
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
|
||||
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
|
||||
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
|
||||
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
|
||||
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
|
||||
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
|
||||
datasource.setDescription(mapStringField(m.getDescription()));
|
||||
datasource.setEnglishname(mapStringField(m.getEnglishname()));
|
||||
datasource.setLatitude(mapStringField(m.getLatitude()));
|
||||
datasource.setLongitude(mapStringField(m.getLongitude()));
|
||||
datasource.setLogourl(mapStringField(m.getLogourl()));
|
||||
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
|
||||
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
|
||||
datasource
|
||||
.setOdcontenttypes(
|
||||
m
|
||||
.getOdcontenttypesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource
|
||||
.setOdlanguages(
|
||||
m
|
||||
.getOdlanguagesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
|
||||
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
|
||||
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
|
||||
datasource.setOfficialname(mapStringField(m.getOfficialname()));
|
||||
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
|
||||
datasource.setPidsystems(mapStringField(m.getPidsystems()));
|
||||
datasource
|
||||
.setPolicies(
|
||||
m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
|
||||
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
|
||||
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
|
||||
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
|
||||
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
|
||||
datasource
|
||||
.setSubjects(
|
||||
m
|
||||
.getSubjectsList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setVersioning(mapBoolField(m.getVersioning()));
|
||||
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
datasource.setJournal(mapJournal(m.getJournal()));
|
||||
|
||||
return datasource;
|
||||
}
|
||||
|
||||
private static Project convertProject(OafProtos.Oaf oaf) {
|
||||
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
|
||||
final Project project = setOaf(new Project(), oaf);
|
||||
setEntity(project, oaf);
|
||||
project.setAcronym(mapStringField(m.getAcronym()));
|
||||
project.setCallidentifier(mapStringField(m.getCallidentifier()));
|
||||
project.setCode(mapStringField(m.getCode()));
|
||||
project.setContactemail(mapStringField(m.getContactemail()));
|
||||
project.setContactfax(mapStringField(m.getContactfax()));
|
||||
project.setContactfullname(mapStringField(m.getContactfullname()));
|
||||
project.setContactphone(mapStringField(m.getContactphone()));
|
||||
project.setContracttype(mapQualifier(m.getContracttype()));
|
||||
project.setCurrency(mapStringField(m.getCurrency()));
|
||||
project.setDuration(mapStringField(m.getDuration()));
|
||||
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
|
||||
project.setEcsc39(mapStringField(m.getEcsc39()));
|
||||
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
|
||||
project.setStartdate(mapStringField(m.getStartdate()));
|
||||
project.setEnddate(mapStringField(m.getEnddate()));
|
||||
project.setFundedamount(m.getFundedamount());
|
||||
project.setTotalcost(m.getTotalcost());
|
||||
project.setKeywords(mapStringField(m.getKeywords()));
|
||||
project
|
||||
.setSubjects(
|
||||
m
|
||||
.getSubjectsList()
|
||||
.stream()
|
||||
.map(sp -> mapStructuredProperty(sp))
|
||||
.collect(Collectors.toList()));
|
||||
project.setTitle(mapStringField(m.getTitle()));
|
||||
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
project
|
||||
.setFundingtree(
|
||||
m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList()));
|
||||
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
|
||||
project.setSummary(mapStringField(m.getSummary()));
|
||||
project.setOptional1(mapStringField(m.getOptional1()));
|
||||
project.setOptional2(mapStringField(m.getOptional2()));
|
||||
return project;
|
||||
}
|
||||
|
||||
private static Result convertResult(OafProtos.Oaf oaf) {
|
||||
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
|
||||
case "dataset":
|
||||
return createDataset(oaf);
|
||||
case "publication":
|
||||
return createPublication(oaf);
|
||||
case "software":
|
||||
return createSoftware(oaf);
|
||||
case "other":
|
||||
return createORP(oaf);
|
||||
default:
|
||||
Result result = setOaf(new Result(), oaf);
|
||||
setEntity(result, oaf);
|
||||
return setResult(result, oaf);
|
||||
}
|
||||
}
|
||||
|
||||
private static Software createSoftware(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Software software = setOaf(new Software(), oaf);
|
||||
setEntity(software, oaf);
|
||||
setResult(software, oaf);
|
||||
|
||||
software
|
||||
.setDocumentationUrl(
|
||||
m
|
||||
.getDocumentationUrlList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
software
|
||||
.setLicense(
|
||||
m
|
||||
.getLicenseList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
|
||||
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
|
||||
return software;
|
||||
}
|
||||
|
||||
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
|
||||
setEntity(otherResearchProducts, oaf);
|
||||
setResult(otherResearchProducts, oaf);
|
||||
otherResearchProducts
|
||||
.setContactperson(
|
||||
m
|
||||
.getContactpersonList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
otherResearchProducts
|
||||
.setContactgroup(
|
||||
m
|
||||
.getContactgroupList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
otherResearchProducts
|
||||
.setTool(
|
||||
m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList()));
|
||||
|
||||
return otherResearchProducts;
|
||||
}
|
||||
|
||||
private static Publication createPublication(OafProtos.Oaf oaf) {
|
||||
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Publication publication = setOaf(new Publication(), oaf);
|
||||
setEntity(publication, oaf);
|
||||
setResult(publication, oaf);
|
||||
publication.setJournal(mapJournal(m.getJournal()));
|
||||
return publication;
|
||||
}
|
||||
|
||||
private static Dataset createDataset(OafProtos.Oaf oaf) {
|
||||
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Dataset dataset = setOaf(new Dataset(), oaf);
|
||||
setEntity(dataset, oaf);
|
||||
setResult(dataset, oaf);
|
||||
dataset.setStoragedate(mapStringField(m.getStoragedate()));
|
||||
dataset.setDevice(mapStringField(m.getDevice()));
|
||||
dataset.setSize(mapStringField(m.getSize()));
|
||||
dataset.setVersion(mapStringField(m.getVersion()));
|
||||
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
|
||||
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
|
||||
dataset
|
||||
.setGeolocation(
|
||||
m
|
||||
.getGeolocationList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapGeolocation)
|
||||
.collect(Collectors.toList()));
|
||||
return dataset;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
|
||||
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
|
||||
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
|
||||
return oaf;
|
||||
}
|
||||
|
||||
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
|
||||
// setting Entity fields
|
||||
final OafProtos.OafEntity e = oaf.getEntity();
|
||||
entity.setId(e.getId());
|
||||
entity.setOriginalId(e.getOriginalIdList());
|
||||
entity
|
||||
.setCollectedfrom(
|
||||
e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
|
||||
entity
|
||||
.setPid(
|
||||
e
|
||||
.getPidList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDateofcollection(e.getDateofcollection());
|
||||
entity.setDateoftransformation(e.getDateoftransformation());
|
||||
entity
|
||||
.setExtraInfo(
|
||||
e
|
||||
.getExtraInfoList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapExtraInfo)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
|
||||
// setting Entity fields
|
||||
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
entity
|
||||
.setAuthor(
|
||||
m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList()));
|
||||
entity.setResulttype(mapQualifier(m.getResulttype()));
|
||||
entity.setLanguage(mapQualifier(m.getLanguage()));
|
||||
entity
|
||||
.setCountry(
|
||||
m
|
||||
.getCountryList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapQualifierAsCountry)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setSubject(
|
||||
m
|
||||
.getSubjectList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setTitle(
|
||||
m
|
||||
.getTitleList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setRelevantdate(
|
||||
m
|
||||
.getRelevantdateList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setDescription(
|
||||
m
|
||||
.getDescriptionList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
|
||||
entity.setPublisher(mapStringField(m.getPublisher()));
|
||||
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
|
||||
entity
|
||||
.setSource(
|
||||
m
|
||||
.getSourceList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setFulltext(
|
||||
m
|
||||
.getFulltextList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setFormat(
|
||||
m
|
||||
.getFormatList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setContributor(
|
||||
m
|
||||
.getContributorList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setResourcetype(mapQualifier(m.getResourcetype()));
|
||||
entity
|
||||
.setCoverage(
|
||||
m
|
||||
.getCoverageList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setContext(
|
||||
m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList()));
|
||||
|
||||
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
|
||||
|
||||
return entity;
|
||||
}
|
||||
|
||||
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
|
||||
if (instanceList != null) {
|
||||
final Optional<FieldTypeProtos.Qualifier> min = instanceList
|
||||
.stream()
|
||||
.map(i -> i.getAccessright())
|
||||
.min(new LicenseComparator());
|
||||
|
||||
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
|
||||
|
||||
if (StringUtils.isBlank(rights.getClassid())) {
|
||||
rights.setClassid(UNKNOWN);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getClassname())
|
||||
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||
rights.setClassname(NOT_AVAILABLE);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemename())) {
|
||||
rights.setSchemename(DNET_ACCESS_MODES);
|
||||
}
|
||||
|
||||
return rights;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static Context mapContext(ResultProtos.Result.Context context) {
|
||||
if (context == null || StringUtils.isBlank(context.getId())) {
|
||||
return null;
|
||||
}
|
||||
final Context entity = new Context();
|
||||
entity.setId(context.getId());
|
||||
entity
|
||||
.setDataInfo(
|
||||
context
|
||||
.getDataInfoList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapDataInfo)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
||||
if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final KeyValue keyValue = new KeyValue();
|
||||
keyValue.setKey(kv.getKey());
|
||||
keyValue.setValue(kv.getValue());
|
||||
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
|
||||
return keyValue;
|
||||
}
|
||||
|
||||
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
|
||||
final DataInfo dataInfo = new DataInfo();
|
||||
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
|
||||
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
|
||||
dataInfo.setInferred(d.getInferred());
|
||||
dataInfo.setInvisible(d.getInvisible());
|
||||
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
|
||||
dataInfo.setTrust(d.getTrust());
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
|
||||
final Qualifier qualifier = new Qualifier();
|
||||
qualifier.setClassid(q.getClassid());
|
||||
qualifier.setClassname(q.getClassname());
|
||||
qualifier.setSchemeid(q.getSchemeid());
|
||||
qualifier.setSchemename(q.getSchemename());
|
||||
return qualifier;
|
||||
}
|
||||
|
||||
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
|
||||
final Country c = new Country();
|
||||
c.setClassid(q.getClassid());
|
||||
c.setClassname(q.getClassname());
|
||||
c.setSchemeid(q.getSchemeid());
|
||||
c.setSchemename(q.getSchemename());
|
||||
c.setDataInfo(mapDataInfo(q.getDataInfo()));
|
||||
return c;
|
||||
}
|
||||
|
||||
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
||||
if (sp == null | StringUtils.isBlank(sp.getValue())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final StructuredProperty structuredProperty = new StructuredProperty();
|
||||
structuredProperty.setValue(sp.getValue());
|
||||
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
||||
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
|
||||
return structuredProperty;
|
||||
}
|
||||
|
||||
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
|
||||
final ExtraInfo entity = new ExtraInfo();
|
||||
entity.setName(extraInfo.getName());
|
||||
entity.setTypology(extraInfo.getTypology());
|
||||
entity.setProvenance(extraInfo.getProvenance());
|
||||
entity.setTrust(extraInfo.getTrust());
|
||||
entity.setValue(extraInfo.getValue());
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
|
||||
final OAIProvenance entity = new OAIProvenance();
|
||||
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static OriginDescription mapOriginalDescription(
|
||||
FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
|
||||
final OriginDescription originDescriptionResult = new OriginDescription();
|
||||
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
|
||||
originDescriptionResult.setAltered(originDescription.getAltered());
|
||||
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
|
||||
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
|
||||
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
|
||||
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
|
||||
return originDescriptionResult;
|
||||
}
|
||||
|
||||
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
||||
if (s == null || StringUtils.isBlank(s.getValue())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final Field<String> stringField = new Field<>();
|
||||
stringField.setValue(s.getValue());
|
||||
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
||||
return stringField;
|
||||
}
|
||||
|
||||
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
||||
if (b == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final Field<Boolean> booleanField = new Field<>();
|
||||
booleanField.setValue(b.getValue());
|
||||
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||
return booleanField;
|
||||
}
|
||||
|
||||
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
||||
final Journal journal = new Journal();
|
||||
journal.setConferencedate(j.getConferencedate());
|
||||
journal.setConferenceplace(j.getConferenceplace());
|
||||
journal.setEdition(j.getEdition());
|
||||
journal.setEp(j.getEp());
|
||||
journal.setIss(j.getIss());
|
||||
journal.setIssnLinking(j.getIssnLinking());
|
||||
journal.setIssnOnline(j.getIssnOnline());
|
||||
journal.setIssnPrinted(j.getIssnPrinted());
|
||||
journal.setName(j.getName());
|
||||
journal.setSp(j.getSp());
|
||||
journal.setVol(j.getVol());
|
||||
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
|
||||
return journal;
|
||||
}
|
||||
|
||||
public static Author mapAuthor(FieldTypeProtos.Author author) {
|
||||
final Author entity = new Author();
|
||||
entity.setFullname(author.getFullname());
|
||||
entity.setName(author.getName());
|
||||
entity.setSurname(author.getSurname());
|
||||
entity.setRank(author.getRank());
|
||||
entity
|
||||
.setPid(
|
||||
author
|
||||
.getPidList()
|
||||
.stream()
|
||||
.map(
|
||||
kv -> {
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(kv.getValue());
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(kv.getKey());
|
||||
q.setClassname(kv.getKey());
|
||||
sp.setQualifier(q);
|
||||
return sp;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
entity
|
||||
.setAffiliation(
|
||||
author
|
||||
.getAffiliationList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
|
||||
final GeoLocation entity = new GeoLocation();
|
||||
entity.setPoint(geoLocation.getPoint());
|
||||
entity.setBox(geoLocation.getBox());
|
||||
entity.setPlace(geoLocation.getPlace());
|
||||
return entity;
|
||||
}
|
||||
}
|
|
@ -1,172 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.migration;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
|
||||
import eu.dnetlib.data.proto.OafProtos;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class TransformActions implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(TransformActions.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final String SEPARATOR = "/";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
MigrateActionSet.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final String inputPaths = parser.get("inputPaths");
|
||||
|
||||
if (StringUtils.isBlank(inputPaths)) {
|
||||
throw new RuntimeException("empty inputPaths");
|
||||
}
|
||||
log.info("inputPaths: {}", inputPaths);
|
||||
|
||||
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark));
|
||||
}
|
||||
|
||||
private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark)
|
||||
throws IOException {
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
|
||||
for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
|
||||
|
||||
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
|
||||
|
||||
final String rawset = pathQ.pollLast();
|
||||
final String actionSetDirectory = pathQ.pollLast();
|
||||
|
||||
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
|
||||
|
||||
if (fs.exists(targetDirectory)) {
|
||||
log.info("found target directory '{}", targetDirectory);
|
||||
fs.delete(targetDirectory, true);
|
||||
log.info("deleted target directory '{}", targetDirectory);
|
||||
}
|
||||
|
||||
log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory);
|
||||
|
||||
sc
|
||||
.sequenceFile(sourcePath, Text.class, Text.class)
|
||||
.map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString()))
|
||||
.map(TransformActions::doTransform)
|
||||
.filter(Objects::nonNull)
|
||||
.mapToPair(
|
||||
a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a)))
|
||||
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
|
||||
.saveAsNewAPIHadoopFile(
|
||||
targetDirectory.toString(),
|
||||
Text.class,
|
||||
Text.class,
|
||||
SequenceFileOutputFormat.class,
|
||||
sc.hadoopConfiguration());
|
||||
}
|
||||
}
|
||||
|
||||
private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa)
|
||||
throws InvalidProtocolBufferException {
|
||||
|
||||
// dedup similarity relations had empty target value, don't migrate them
|
||||
if (aa.getTargetValue().length == 0) {
|
||||
return null;
|
||||
}
|
||||
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
|
||||
final Oaf oaf = ProtoConverter.convert(proto_oaf);
|
||||
switch (proto_oaf.getKind()) {
|
||||
case entity:
|
||||
switch (proto_oaf.getEntity().getType()) {
|
||||
case datasource:
|
||||
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
|
||||
case organization:
|
||||
return new AtomicAction<>(Organization.class, (Organization) oaf);
|
||||
case project:
|
||||
return new AtomicAction<>(Project.class, (Project) oaf);
|
||||
case result:
|
||||
final String resulttypeid = proto_oaf
|
||||
.getEntity()
|
||||
.getResult()
|
||||
.getMetadata()
|
||||
.getResulttype()
|
||||
.getClassid();
|
||||
switch (resulttypeid) {
|
||||
case "publication":
|
||||
return new AtomicAction<>(Publication.class, (Publication) oaf);
|
||||
case "software":
|
||||
return new AtomicAction<>(Software.class, (Software) oaf);
|
||||
case "other":
|
||||
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
|
||||
case "dataset":
|
||||
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
|
||||
default:
|
||||
// can be an update, where the resulttype is not specified
|
||||
return new AtomicAction<>(Result.class, (Result) oaf);
|
||||
}
|
||||
default:
|
||||
throw new IllegalArgumentException(
|
||||
"invalid entity type: " + proto_oaf.getEntity().getType());
|
||||
}
|
||||
case relation:
|
||||
return new AtomicAction<>(Relation.class, (Relation) oaf);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
|
||||
}
|
||||
}
|
||||
|
||||
private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
|
||||
return isLookUp.getResourceProfileByQuery(XQUERY);
|
||||
}
|
||||
}
|
|
@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -68,6 +68,12 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
|
||||
logger.info("strategy: {}", strategy);
|
||||
|
||||
Boolean shouldGroupById = Optional
|
||||
.ofNullable(parser.get("shouldGroupById"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(true);
|
||||
logger.info("shouldGroupById: {}", shouldGroupById);
|
||||
|
||||
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
|
||||
Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName);
|
||||
|
||||
|
@ -89,7 +95,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
outputGraphTablePath,
|
||||
strategy,
|
||||
rowClazz,
|
||||
actionPayloadClazz);
|
||||
actionPayloadClazz,
|
||||
shouldGroupById);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -115,12 +122,12 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
String outputGraphTablePath,
|
||||
MergeAndGet.Strategy strategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz) {
|
||||
Class<A> actionPayloadClazz, Boolean shouldGroupById) {
|
||||
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
|
||||
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
|
||||
|
||||
Dataset<G> result = promoteActionPayloadForGraphTable(
|
||||
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz)
|
||||
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
|
||||
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
|
||||
|
||||
saveGraphTable(result, outputGraphTablePath);
|
||||
|
@ -153,9 +160,9 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
|
||||
private static String extractPayload(Row value) {
|
||||
try {
|
||||
return value.<String> getAs("payload");
|
||||
return value.getAs("payload");
|
||||
} catch (IllegalArgumentException | ClassCastException e) {
|
||||
logger.error("cannot extract payload from action: {}", value.toString());
|
||||
logger.error("cannot extract payload from action: {}", value);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
@ -174,7 +181,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
Dataset<A> actionPayloadDS,
|
||||
MergeAndGet.Strategy strategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz) {
|
||||
Class<A> actionPayloadClazz,
|
||||
Boolean shouldGroupById) {
|
||||
logger
|
||||
.info(
|
||||
"Promoting action payload for graph table: payload={}, table={}",
|
||||
|
@ -186,7 +194,7 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
|
||||
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
|
||||
SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
|
||||
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource;
|
||||
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
|
||||
|
||||
Dataset<G> joinedAndMerged = PromoteActionPayloadFunctions
|
||||
.joinGraphTableWithActionPayloadAndMerge(
|
||||
|
@ -198,9 +206,13 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
rowClazz,
|
||||
actionPayloadClazz);
|
||||
|
||||
return PromoteActionPayloadFunctions
|
||||
.groupGraphTableByIdAndMerge(
|
||||
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
|
||||
if (shouldGroupById) {
|
||||
return PromoteActionPayloadFunctions
|
||||
.groupGraphTableByIdAndMerge(
|
||||
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
|
||||
} else {
|
||||
return joinedAndMerged;
|
||||
}
|
||||
}
|
||||
|
||||
private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) {
|
||||
|
@ -226,12 +238,13 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
}
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSource() {
|
||||
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
|
||||
return t -> {
|
||||
if (isSubClass(t, Relation.class)) {
|
||||
return Objects.nonNull(((Relation) t).getSource());
|
||||
final Relation rel = (Relation) t;
|
||||
return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
|
||||
}
|
||||
return Objects.nonNull(((OafEntity) t).getId());
|
||||
return StringUtils.isNotBlank(((OafEntity) t).getId());
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -112,6 +112,7 @@ public class PromoteActionPayloadFunctions {
|
|||
Class<G> rowClazz) {
|
||||
TypedColumn<G, G> aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn();
|
||||
return rowDS
|
||||
.filter((FilterFunction<G>) o -> isNotZeroFn.get().apply(o))
|
||||
.groupByKey((MapFunction<G, String>) x -> rowIdFn.get().apply(x), Encoders.STRING())
|
||||
.agg(aggregator)
|
||||
.map((MapFunction<Tuple2<String, G>, G>) Tuple2::_2, Encoders.kryo(rowClazz));
|
||||
|
|
|
@ -1,56 +0,0 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "is",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "URL of the isLookUp Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "sn",
|
||||
"paramLongName": "sourceNameNode",
|
||||
"paramDescription": "nameNode of the source cluster",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "tn",
|
||||
"paramLongName": "targetNameNode",
|
||||
"paramDescription": "nameNode of the target cluster",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingDirectory",
|
||||
"paramDescription": "working directory",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "nm",
|
||||
"paramLongName": "distcp_num_maps",
|
||||
"paramDescription": "maximum number of map tasks used in the distcp process",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mm",
|
||||
"paramLongName": "distcp_memory_mb",
|
||||
"paramDescription": "memory for distcp action copying actionsets from remote cluster",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "tt",
|
||||
"paramLongName": "distcp_task_timeout",
|
||||
"paramDescription": "timeout for distcp copying actions from remote cluster",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "tr",
|
||||
"paramLongName": "transform_only",
|
||||
"paramDescription": "activate transform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -1,20 +0,0 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "is",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "URL of the isLookUp Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "inputPaths",
|
||||
"paramDescription": "comma separated list of the input paths containing the actions to transform",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -40,5 +40,11 @@
|
|||
"paramLongName": "mergeAndGetStrategy",
|
||||
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "sgid",
|
||||
"paramLongName": "shouldGroupById",
|
||||
"paramDescription": "indicates whether the promotion operation should group objects in the graph by id or not",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -24,6 +24,10 @@
|
|||
<name>mergeAndGetStrategy</name>
|
||||
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shouldGroupById</name>
|
||||
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -111,6 +115,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -162,6 +167,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -56,6 +56,11 @@
|
|||
<name>mergeAndGetStrategy</name>
|
||||
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shouldGroupById</name>
|
||||
<value>false</value>
|
||||
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
|
|
@ -1,138 +0,0 @@
|
|||
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourceNN</name>
|
||||
<description>the source name node</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>the isLookup service endpoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingDirectory</name>
|
||||
<description>working directory</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_memory_mb</name>
|
||||
<value>6144</value>
|
||||
<description>memory for distcp copying actionsets from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_task_timeout</name>
|
||||
<value>60000000</value>
|
||||
<description>timeout for distcp copying actions from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_num_maps</name>
|
||||
<value>1</value>
|
||||
<description>maximum number of map tasks used in the distcp process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>transform_only</name>
|
||||
<description>activate transform-only mode. Only apply transformation step</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="migrate_actionsets"/>
|
||||
|
||||
<action name="migrate_actionsets">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.migration.MigrateActionSet</main-class>
|
||||
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--sourceNameNode</arg><arg>${sourceNN}</arg>
|
||||
<arg>--targetNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--workingDirectory</arg><arg>${workingDirectory}</arg>
|
||||
<arg>--distcp_num_maps</arg><arg>${distcp_num_maps}</arg>
|
||||
<arg>--distcp_memory_mb</arg><arg>${distcp_memory_mb}</arg>
|
||||
<arg>--distcp_task_timeout</arg><arg>${distcp_task_timeout}</arg>
|
||||
<arg>--transform_only</arg><arg>${transform_only}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="transform_actions" />
|
||||
<error to="fail" />
|
||||
</action>
|
||||
|
||||
<action name="transform_actions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>transform_actions</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.migration.TransformActions</class>
|
||||
<jar>dhp-actionmanager-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
|
||||
</spark>
|
||||
<ok to="end"/>
|
||||
<error to="fail"/>
|
||||
</action>
|
||||
|
||||
<kill name="fail">
|
||||
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<end name="end" />
|
||||
|
||||
</workflow-app>
|
|
@ -24,6 +24,10 @@
|
|||
<name>mergeAndGetStrategy</name>
|
||||
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shouldGroupById</name>
|
||||
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -110,6 +114,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -161,6 +166,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -24,6 +24,10 @@
|
|||
<name>mergeAndGetStrategy</name>
|
||||
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shouldGroupById</name>
|
||||
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -111,6 +115,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -162,6 +167,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -24,6 +24,10 @@
|
|||
<name>mergeAndGetStrategy</name>
|
||||
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shouldGroupById</name>
|
||||
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -110,6 +114,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -161,6 +166,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -101,7 +101,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
|
|||
"-outputGraphTablePath",
|
||||
"",
|
||||
"-mergeAndGetStrategy",
|
||||
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name()
|
||||
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name(),
|
||||
"--shouldGroupById",
|
||||
"true"
|
||||
}));
|
||||
|
||||
// then
|
||||
|
@ -141,7 +143,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
|
|||
"-outputGraphTablePath",
|
||||
outputGraphTableDir.toString(),
|
||||
"-mergeAndGetStrategy",
|
||||
strategy.name()
|
||||
strategy.name(),
|
||||
"--shouldGroupById",
|
||||
"true"
|
||||
});
|
||||
|
||||
// then
|
||||
|
|
|
@ -1,29 +1,27 @@
|
|||
Description of the Module
|
||||
--------------------------
|
||||
This module defines a **collector worker application** that runs on Hadoop.
|
||||
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
|
||||
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
|
||||
of each MDStore.
|
||||
|
||||
It is responsible for harvesting metadata using different plugins.
|
||||
## Metadata collection
|
||||
|
||||
The collector worker uses a message queue to inform the progress
|
||||
of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore,
|
||||
It gives, at the end of the job, some information about the status
|
||||
of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
|
||||
The **metadata collection workflow** is responsible for harvesting metadata records from different protocols and responding to
|
||||
different formats and for storing them on HDFS so that they can be further processed.
|
||||
|
||||
To work the collection worker need some parameter like:
|
||||
### Collector Plugins
|
||||
|
||||
* **hdfsPath**: the path where storing the sequential file
|
||||
* **apidescriptor**: the JSON encoding of the API Descriptor
|
||||
* **namenode**: the Name Node URI
|
||||
* **userHDFS**: the user wich create the hdfs seq file
|
||||
* **rabbitUser**: the user to connect with RabbitMq for messaging
|
||||
* **rabbitPassWord**: the password to connect with RabbitMq for messaging
|
||||
* **rabbitHost**: the host of the RabbitMq server
|
||||
* **rabbitOngoingQueue**: the name of the ongoing queue
|
||||
* **rabbitReportQueue**: the name of the report queue
|
||||
* **workflowId**: the identifier of the dnet Workflow
|
||||
Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface:
|
||||
|
||||
##Plugins
|
||||
* OAI Plugin
|
||||
```eu.dnetlib.dhp.collection.plugin.CollectorPlugin```
|
||||
|
||||
The list of the supported plugins:
|
||||
|
||||
* OAI Plugin: collects from OAI-PMH compatible endpoints
|
||||
* MDStore plugin: collects from a given D-Net MetadataStore (identified by MongoDB URI, dbName, MDStoreID)
|
||||
* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter
|
||||
|
||||
# Transformation Plugins
|
||||
TODO
|
||||
|
||||
## Usage
|
||||
TODO
|
|
@ -7,10 +7,44 @@
|
|||
<version>1.2.4-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>${net.alchim31.maven.version}</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
|
@ -24,19 +58,7 @@
|
|||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-core</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
|
@ -57,6 +79,11 @@
|
|||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.json</groupId>
|
||||
<artifactId>json</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
@ -77,8 +104,11 @@
|
|||
<artifactId>commons-compress</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
</project>
|
|
@ -75,7 +75,6 @@ public class CollectAndSave implements Serializable {
|
|||
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
;
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
|
|
|
@ -36,7 +36,7 @@ import scala.Tuple2;
|
|||
*/
|
||||
public class SparkAtomicActionScoreJob implements Serializable {
|
||||
|
||||
private static String DOI = "doi";
|
||||
private static final String DOI = "doi";
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
|
||||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.HttpClients
|
||||
|
||||
import java.io.IOException
|
||||
|
||||
abstract class AbstractRestClient extends Iterator[String]{
|
||||
|
||||
var buffer: List[String] = List()
|
||||
var current_index:Int = 0
|
||||
|
||||
var scroll_value: Option[String] = None
|
||||
|
||||
var complete:Boolean = false
|
||||
|
||||
|
||||
def extractInfo(input: String): Unit
|
||||
|
||||
protected def getBufferData(): Unit
|
||||
|
||||
|
||||
def doHTTPGETRequest(url:String): String = {
|
||||
val httpGet = new HttpGet(url)
|
||||
doHTTPRequest(httpGet)
|
||||
|
||||
}
|
||||
|
||||
def doHTTPPOSTRequest(url:String, json:String): String = {
|
||||
val httpPost = new HttpPost(url)
|
||||
if (json != null) {
|
||||
val entity = new StringEntity(json)
|
||||
httpPost.setEntity(entity)
|
||||
httpPost.setHeader("Accept", "application/json")
|
||||
httpPost.setHeader("Content-type", "application/json")
|
||||
}
|
||||
doHTTPRequest(httpPost)
|
||||
}
|
||||
|
||||
def hasNext: Boolean = {
|
||||
buffer.nonEmpty && current_index < buffer.size
|
||||
}
|
||||
|
||||
|
||||
override def next(): String = {
|
||||
val next_item:String = buffer(current_index)
|
||||
current_index = current_index + 1
|
||||
if (current_index == buffer.size)
|
||||
getBufferData()
|
||||
next_item
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
  * Executes the given HTTP request and returns the response body as a string.
  *
  * The request is attempted up to 4 times: every response with an HTTP error
  * status (>= 400) consumes one attempt. When all attempts are exhausted an
  * empty string is returned.
  *
  * @param r the request to execute
  * @return the response body decoded as UTF-8, or "" when every attempt got an error status
  * @throws RuntimeException wrapping any failure raised while executing the request
  *                          or while closing the underlying client
  */
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
  val client = HttpClients.createDefault
  var tries = 4
  try {
    while (tries > 0) {
      println(s"requesting ${r.getURI}")
      val response = client.execute(r)
      // The response must always be closed, otherwise the connection leaks on every retry.
      try {
        val status = response.getStatusLine.getStatusCode
        println(s"get response with status$status")
        // 400 itself is an error as well: the previous '> 400' check returned the
        // error payload of a "Bad Request" as if it were valid data.
        if (status >= 400)
          tries -= 1
        else
          // Explicit charset: do not depend on the platform default encoding.
          return IOUtils.toString(response.getEntity.getContent, "UTF-8")
      } finally response.close()
    }
    ""
  } catch {
    case e: Throwable =>
      throw new RuntimeException("Error on executing request ", e)
  } finally try client.close()
  catch {
    case e: IOException =>
      throw new RuntimeException("Unable to close client ", e)
  }
}
|
||||
|
||||
getBufferData()
|
||||
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
|
||||
|
||||
override def extractInfo(input: String): Unit = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s)))
|
||||
val next_url = (json \ "links" \ "next").extractOrElse[String](null)
|
||||
scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None
|
||||
if (scroll_value.isEmpty)
|
||||
complete = true
|
||||
current_index = 0
|
||||
}
|
||||
|
||||
def get_url():String ={
|
||||
val to = if (until> 0) s"$until" else "*"
|
||||
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
|
||||
|
||||
}
|
||||
|
||||
override def getBufferData(): Unit = {
|
||||
if (!complete) {
|
||||
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
|
||||
extractInfo(response)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,593 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.nio.charset.CodingErrorAction
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.LocalDate
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util
|
||||
import java.util.regex.Pattern
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
|
||||
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
|
||||
|
||||
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
|
||||
|
||||
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
|
||||
|
||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||
|
||||
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
|
||||
|
||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||
|
||||
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
|
||||
|
||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||
|
||||
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
val REL_TYPE_VALUE:String = "resultResult"
|
||||
|
||||
val subRelTypeMapping: Map[String,(String,String)] = Map(
|
||||
"References" ->("IsReferencedBy","relationship"),
|
||||
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
|
||||
"IsPartOf" ->("HasPart","part"),
|
||||
"HasPart" ->("IsPartOf","part"),
|
||||
"IsVersionOf" ->("HasVersion","version"),
|
||||
"HasVersion" ->("IsVersionOf","version"),
|
||||
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
|
||||
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
|
||||
"IsContinuedBy" ->("Continues","relationship"),
|
||||
"Continues" ->("IsContinuedBy","relationship"),
|
||||
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
|
||||
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
|
||||
"IsDocumentedBy" ->("Documents","relationship"),
|
||||
"IsSourceOf" ->("IsDerivedFrom","relationship"),
|
||||
"Cites" ->("IsCitedBy","citation"),
|
||||
"IsCitedBy" ->("Cites","citation"),
|
||||
"IsDerivedFrom" ->("IsSourceOf","relationship"),
|
||||
"IsVariantFormOf" ->("IsDerivedFrom","version"),
|
||||
"IsReferencedBy" ->("References","relationship"),
|
||||
"IsObsoletedBy" ->("IsNewVersionOf","version"),
|
||||
"Reviews" ->("IsReviewedBy","review"),
|
||||
"Documents" ->("IsDocumentedBy","relationship"),
|
||||
"IsCompiledBy" ->("Compiles","relationship"),
|
||||
"Compiles" ->("IsCompiledBy","relationship"),
|
||||
"IsReviewedBy" ->("Reviews","review")
|
||||
)
|
||||
|
||||
implicit val codec: Codec = Codec("UTF-8")
|
||||
codec.onMalformedInput(CodingErrorAction.REPLACE)
|
||||
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
|
||||
|
||||
val DOI_CLASS = "doi"
|
||||
val SUBJ_CLASS = "keywords"
|
||||
|
||||
|
||||
/**
  * Lines of the `datacite_filter` classpath resource, loaded once.
  * A record is discarded by `filter_json` when its JSON contains any of these lines.
  */
val j_filter: List[String] = {
  val source = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter"))
  // getLines() avoids String.lines, which resolves to java.lang.String#lines on JDK 11+;
  // toList forces the lazy iterator before the source is closed.
  try source.getLines().toList
  finally source.close()
}
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
|
||||
|
||||
val dataInfo: DataInfo = generateDataInfo("0.9")
|
||||
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
|
||||
|
||||
val hostedByMap: Map[String, HostedByMapType] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(s)
|
||||
json.extract[Map[String, HostedByMapType]]
|
||||
}
|
||||
|
||||
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
|
||||
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
|
||||
val funder_regex: List[(Pattern, String)] = List(
|
||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
|
||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
|
||||
|
||||
)
|
||||
|
||||
val Date_regex: List[Pattern] = List(
|
||||
//Y-M-D
|
||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||
//M-D-Y
|
||||
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
||||
//D-M-Y
|
||||
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
||||
//Y
|
||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||
)
|
||||
|
||||
|
||||
def filter_json(json: String): Boolean = {
|
||||
j_filter.exists(f => json.contains(f))
|
||||
}
|
||||
|
||||
/**
  * Wraps an OAF entity into an AtomicAction and serializes it to JSON.
  *
  * @param item the entity to wrap; Dataset, Publication, Software,
  *             OtherResearchProduct and Relation are supported
  * @return a pair (canonical class name of the entity, JSON of the AtomicAction),
  *         or null when the entity type is not supported
  */
def toActionSet(item: Oaf): (String, String) = {
  item match {
    case dataset: OafDataset => toSerializedAtomicAction(dataset, classOf[OafDataset])
    case publication: Publication => toSerializedAtomicAction(publication, classOf[Publication])
    case software: Software => toSerializedAtomicAction(software, classOf[Software])
    case orp: OtherResearchProduct => toSerializedAtomicAction(orp, classOf[OtherResearchProduct])
    case relation: Relation => toSerializedAtomicAction(relation, classOf[Relation])
    case _ => null
  }
}

/**
  * Builds the AtomicAction for the given payload and serializes it with the
  * shared object-level `mapper` instead of allocating an ObjectMapper per record.
  */
private def toSerializedAtomicAction[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
  val action: AtomicAction[T] = new AtomicAction[T]
  action.setClazz(clazz)
  action.setPayload(payload)
  (payload.getClass.getCanonicalName, mapper.writeValueAsString(action))
}
|
||||
|
||||
|
||||
def embargo_end(embargo_end_date: String): Boolean = {
|
||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||
val td = LocalDate.now()
|
||||
td.isAfter(dt)
|
||||
}
|
||||
|
||||
|
||||
/**
  * Extracts the first date found in a free-text string.
  *
  * The patterns in `Date_regex` are tried in order and the first one that
  * matches anywhere in the input wins. A 4-character match (a bare year) is
  * expanded to `01-01-<year>`. The candidate is then parsed with `df_en` and,
  * if that fails, with `df_it`.
  *
  * @param input the text to scan
  * @return the parsed date rendered by `LocalDate.toString`, or None when no
  *         pattern matches or the match cannot be parsed
  */
def extract_date(input: String): Option[String] = {
  // Lazy iteration: stop at the first matching pattern instead of running all of them.
  val candidate: Option[String] = Date_regex.iterator
    .map { pattern =>
      val matcher = pattern.matcher(input)
      if (matcher.find()) Some(matcher.group(0)) else None
    }
    .find(_.isDefined)
    .flatten

  candidate.flatMap { raw =>
    val a_date = if (raw.length == 4) s"01-01-$raw" else raw
    // Try only catches non-fatal errors, so OutOfMemoryError and the like are
    // no longer silently turned into "no date".
    scala.util.Try(LocalDate.parse(a_date, df_en))
      .orElse(scala.util.Try(LocalDate.parse(a_date, df_it)))
      .toOption
      .map(_.toString)
  }
}
|
||||
|
||||
/**
  * Resolves the instance type of a record and the matching result typology.
  *
  * The candidate labels are looked up in the `DNET_PUBLICATION_RESOURCE`
  * vocabulary in this order: resourceType, schemaOrg, resourceTypeGeneral.
  * Null or empty labels are skipped. The first label with a known synonym wins.
  *
  * @return a pair (instance type qualifier, result typology qualifier looked up
  *         by the instance type class id), or null when no label can be resolved
  */
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
  // An Iterator keeps the lookups lazy: a later label is only resolved when
  // the earlier ones produced nothing.
  Iterator(resourceType, schemaOrg, resourceTypeGeneral)
    .filter(label => label != null && label.nonEmpty)
    .map(label => vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, label))
    .find(qualifier => qualifier != null)
    .map(qualifier => (qualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, qualifier.getClassid)))
    .orNull
}
|
||||
|
||||
|
||||
/**
  * Creates the empty OAF result matching the type of a Datacite record.
  *
  * The type is resolved through `getTypeQualifier`. The class name of the
  * resulting typology qualifier selects the concrete class: "dataset",
  * "publication", "software" or "other". The new result carries a single
  * Instance whose instance type is the resolved qualifier.
  *
  * @return the new result, or null when the type cannot be resolved or the
  *         typology is not one of the four supported values
  */
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
  val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
  // The typology lookup may yield no qualifier even when the instance type was resolved.
  if (typeQualifiers == null || typeQualifiers._2 == null)
    return null

  val result: Result = typeQualifiers._2.getClassname match {
    case "dataset" => new OafDataset
    case "publication" => new Publication
    case "software" => new Software
    case "other" => new OtherResearchProduct
    // Without this default an unexpected typology raised a MatchError
    // instead of returning null.
    case _ => null
  }

  if (result != null) {
    val i = new Instance
    i.setInstancetype(typeQualifiers._1)
    result.setInstance(List(i).asJava)
  }
  result
}
|
||||
|
||||
|
||||
def available_date(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
val l: List[String] = for {
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JField("dateType", JString(dateTypes)) <- dates
|
||||
} yield dateTypes
|
||||
|
||||
l.exists(p => p.equalsIgnoreCase("available"))
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* As described in ticket #6377
|
||||
* when the result comes from figshare we need to remove its subjects
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
def fix_figshare(r: Result): Unit = {
|
||||
|
||||
if (r.getInstance() != null) {
|
||||
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||
}
|
||||
|
||||
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||
OafMapperUtils.structuredProperty(dt, q, null)
|
||||
}
|
||||
|
||||
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
r.setTarget(targetId)
|
||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
||||
r.setRelClass(relClass)
|
||||
r.setSubRelType(ModelConstants.OUTCOME)
|
||||
r.setCollectedfrom(List(cf).asJava)
|
||||
r.setDataInfo(di)
|
||||
r
|
||||
|
||||
|
||||
}
|
||||
|
||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
|
||||
|
||||
if (match_pattern.isDefined) {
|
||||
val m = match_pattern.get._1
|
||||
val p = match_pattern.get._2
|
||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||
List(
|
||||
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
|
||||
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
|
||||
)
|
||||
}
|
||||
else
|
||||
List()
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
|
||||
if (filter_json(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||
|
||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||
if (doi.isEmpty)
|
||||
return List()
|
||||
|
||||
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
|
||||
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (result == null)
|
||||
return List()
|
||||
|
||||
|
||||
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
result.setOriginalId(List(doi).asJava)
|
||||
|
||||
val d = new Date(dateOfCollection * 1000)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
|
||||
|
||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||
result.setDateoftransformation(ISO8601FORMAT.format(ts))
|
||||
result.setDataInfo(dataInfo)
|
||||
|
||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||
|
||||
|
||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||
val a = new Author
|
||||
a.setFullname(c.name.orNull)
|
||||
a.setName(c.givenName.orNull)
|
||||
a.setSurname(c.familyName.orNull)
|
||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||
a.setPid(c.nameIdentifiers.get.map(ni => {
|
||||
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
}
|
||||
else
|
||||
null
|
||||
|
||||
}
|
||||
)
|
||||
.asJava)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
|
||||
a.setRank(idx + 1)
|
||||
a
|
||||
}
|
||||
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
|
||||
}
|
||||
}).asJava)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val dates = (json \\ "dates").extract[List[DateType]]
|
||||
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||
|
||||
val i_date = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
val a_date: Option[String] = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
.find(d => d != null && d.isDefined)
|
||||
.map(d => d.get)
|
||||
|
||||
if (a_date.isDefined) {
|
||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||
}
|
||||
if (i_date.isDefined && i_date.get.isDefined) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
}
|
||||
else if (publication_year != null) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
}
|
||||
|
||||
|
||||
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2)).asJava)
|
||||
|
||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||
|
||||
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
||||
).asJava)
|
||||
|
||||
|
||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
|
||||
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||
|
||||
result.setDescription(
|
||||
descriptions
|
||||
.filter(d => d.description.isDefined).
|
||||
map(d =>
|
||||
OafMapperUtils.field(d.description.get, null)
|
||||
).filter(s => s != null).asJava)
|
||||
|
||||
|
||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||
if (publisher != null)
|
||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||
|
||||
|
||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||
|
||||
if (language != null)
|
||||
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
|
||||
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
}).find(q => q != null).map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
|
||||
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
|
||||
if (client.isDefined) {
|
||||
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
|
||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
|
||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
}
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
|
||||
|
||||
fix_figshare(result)
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
if (result.getId == null)
|
||||
return List()
|
||||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
|
||||
relations = relations ::: generateRelations(rels,result.getId)
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
}
|
||||
else
|
||||
List(result)
|
||||
}
|
||||
|
||||
/**
  * Turns the related identifiers of a Datacite record into OAF relations.
  *
  * Only identifiers whose relation type is a key of `subRelTypeMapping` and whose
  * identifier type is doi, pmid or arxiv (case-insensitive) are considered. Each of
  * them yields two relations: the direct one (record -> related identifier) and its
  * inverse (related identifier -> record).
  *
  * @param rels related identifiers extracted from the Datacite json
  * @param id   identifier of the record the relations start from
  * @return the direct and inverse relations, in the order of the accepted identifiers
  */
private def generateRelations(rels: List[RelatedIdentifierType], id:String):List[Relation] = {

  val acceptedPidTypes = List("doi", "pmid", "arxiv")

  // A relation can be exported when its semantic is mapped and its pid type is supported.
  def canBeExported(candidate: RelatedIdentifierType): Boolean =
    subRelTypeMapping.contains(candidate.relationType) &&
      acceptedPidTypes.exists(pidType => candidate.relatedIdentifierType.equalsIgnoreCase(pidType))

  // Creates a relation carrying the provenance information shared by both directions.
  def emptyRelation(): Relation = {
    val relation = new Relation
    relation.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
    relation.setDataInfo(dataInfo)
    relation
  }

  for {
    related <- rels.filter(canBeExported)
    relation <- {
      val direct = emptyRelation()
      val inverse = emptyRelation()

      // the mapping value holds (inverse semantic, sub relation type)
      val mapped = subRelTypeMapping(related.relationType)
      val inverseSemantic = mapped._1
      val inverseSubRelType = subRelTypeMapping(inverseSemantic)._2

      direct.setRelType(REL_TYPE_VALUE)
      direct.setSubRelType(mapped._2)
      direct.setRelClass(related.relationType)

      inverse.setRelType(REL_TYPE_VALUE)
      inverse.setSubRelType(inverseSubRelType)
      inverse.setRelClass(inverseSemantic)

      direct.setSource(id)
      direct.setTarget(createDNetTargetIdentifier(related.relatedIdentifier, related.relatedIdentifierType, "50|"))

      inverse.setTarget(id)
      inverse.setSource(createDNetTargetIdentifier(related.relatedIdentifier, related.relatedIdentifierType, "50|"))

      List(direct, inverse)
    }
  } yield relation
}
|
||||
|
||||
/**
  * Creates a DataInfo with the given trust value.
  *
  * The deletedbyinference, inferred and invisible flags are all set to false and the
  * provenance action is ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER.
  *
  * @param trust the trust value stored in the DataInfo
  * @return the newly created DataInfo
  */
def generateDataInfo(trust: String): DataInfo = {
  val info = new DataInfo
  info.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
  info.setTrust(trust)
  info.setInvisible(false)
  info.setInferred(false)
  info.setDeletedbyinference(false)
  info
}
|
||||
|
||||
/**
  * Builds a datasource identifier of the form `10|<prefix>::<md5>`.
  *
  * The input is split on the first occurrence of "::": what precedes it is kept as
  * prefix, what follows it is replaced by its md5 (computed by DHPUtils.md5).
  *
  * @param input the identifier to convert, expected to contain "::"
  * @return the identifier prefixed by "10|"
  */
def generateDSId(input: String): String = {
  val prefix = StringUtils.substringBefore(input, "::")
  val localPart = StringUtils.substringAfter(input, "::")
  val hashedLocalPart = DHPUtils.md5(localPart)
  s"10|${prefix}::${hashedLocalPart}"
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/**
  * Spark job that reads a dataset of Oaf entities and stores them as an action set,
  * i.e. a gzip-compressed Hadoop sequence file of (Text, Text) pairs.
  */
object ExportActionSetJobNode {

  val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)

  /**
    * Entry point of the job.
    *
    * Reads the parameters `master`, `sourcePath` and `targetPath`, loads the Oaf
    * dataset found in `sourcePath`, converts each item with
    * DataciteToOAFTransformation.toActionSet, drops the null results and writes
    * the remaining pairs in `targetPath`.
    *
    * @param args command line arguments, described in exportDataset_parameters.json
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf

    val parameterDefinition = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json"))
      .mkString
    val parser = new ArgumentApplicationParser(parameterDefinition)
    parser.parseArgument(args)

    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")

    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .appName(ExportActionSetJobNode.getClass.getSimpleName)
      .master(master)
      .getOrCreate()

    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)

    val actions = spark.read
      .load(sourcePath)
      .as[Oaf]
      .map(entity => DataciteToOAFTransformation.toActionSet(entity))
      .filter(action => action != null)

    actions.rdd
      .map(action => (new Text(action._1), new Text(action._2)))
      .saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
  }

}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/**
  * Spark job that converts the Datacite dump (a dataset of DataciteType) into a
  * dataset of Oaf entities.
  */
object GenerateDataciteDatasetSpark {

  val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)

  /**
    * Entry point of the job.
    *
    * Loads the vocabularies from the information service found at `isLookupUrl`,
    * reads the DataciteType items stored in `sourcePath`, keeps the active ones and
    * transforms them with DataciteToOAFTransformation.generateOAF; the non-null
    * results overwrite the content of `targetPath`.
    *
    * @param args command line arguments, described in generate_dataset_params.json
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf

    val parameterDefinition = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json"))
      .mkString
    val parser = new ArgumentApplicationParser(parameterDefinition)
    parser.parseArgument(args)

    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
    // relations are generated only when the parameter is explicitly "true"
    val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)

    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
      .master(master)
      .getOrCreate()

    implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]

    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]

    import spark.implicits._

    val activeRecords = spark.read
      .load(sourcePath)
      .as[DataciteType]
      .filter(record => record.isActive)

    // NOTE(review): the record timestamp is passed as both the second and the third argument
    // of generateOAF; its signature is not visible here - confirm this is intended
    activeRecords
      .flatMap(record => DataciteToOAFTransformation.generateOAF(record.json, record.timestamp, record.timestamp, vocabularies, exportLinks))
      .filter(entity => entity != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(targetPath)
  }

}
|
|
@ -0,0 +1,186 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
|
||||
import org.apache.hadoop.hdfs.DistributedFileSystem
|
||||
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.time.format.DateTimeFormatter._
|
||||
import java.time.{LocalDate, LocalDateTime, ZoneOffset}
|
||||
import scala.io.Source
|
||||
|
||||
/**
  * Spark job that updates the Datacite dump with the records downloaded from the
  * Datacite API: new records are stored in a sequence file, converted into a
  * dataset and merged with the existing dump keeping, for each DOI, the record
  * with the highest timestamp.
  */
object ImportDatacite {

  val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)

  /**
    * Parses a json record returned by the Datacite API.
    *
    * @param input the json record; `attributes.doi`, `attributes.isActive` and
    *              `attributes.updated` are read from it
    * @return a DataciteType holding the lower-cased DOI, the update time expressed in
    *         seconds since the epoch (the date is interpreted as UTC), the isActive flag
    *         and the original json
    */
  def convertAPIStringToDataciteItem(input: String): DataciteType = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase

    val isActive = (json \ "attributes" \ "isActive").extract[Boolean]

    val timestamp_string = (json \ "attributes" \ "updated").extract[String]
    val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
    DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)

  }

  /**
    * Entry point of the job.
    *
    * Unless `skipImport` is "true", the records updated after the highest timestamp
    * found in the dump are downloaded into `targetPath`. When at least one record is
    * available, the sequence file is converted into `${targetPath}_dataset`, merged
    * with the dump into `${dataciteDumpPath}_updated` and the result replaces the dump.
    *
    * @param args command line arguments, described in import_from_api.json
    */
  def main(args: Array[String]): Unit = {

    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
    parser.parseArgument(args)
    val master = parser.get("master")

    val hdfsuri = parser.get("namenode")
    log.info(s"namenode is $hdfsuri")

    val targetPath = parser.get("targetPath")
    log.info(s"targetPath is $targetPath")

    val dataciteDump = parser.get("dataciteDumpPath")
    log.info(s"dataciteDump is $dataciteDump")

    val hdfsTargetPath = new Path(targetPath)
    log.info(s"hdfsTargetPath is $hdfsTargetPath")

    // the block size defaults to 100 when the parameter is missing
    val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt

    val spkipImport = parser.get("skipImport")
    log.info(s"skipImport is $spkipImport")

    val spark: SparkSession = SparkSession.builder()
      .appName(ImportDatacite.getClass.getSimpleName)
      .master(master)
      .getOrCreate()

    // ====== Init HDFS File System Object
    val conf = new Configuration
    // Set FileSystem URI
    conf.set("fs.defaultFS", hdfsuri)

    // Because of Maven
    conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
    conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("ERROR")

    import spark.implicits._

    // Keeps, among the records sharing the same DOI, the one with the highest timestamp;
    // when the timestamps are equal the second argument wins.
    val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {

      override def zero: DataciteType = null

      override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
        if (b == null)
          return a
        if (a == null)
          return b
        if (a.timestamp > b.timestamp) {
          return a
        }
        b
      }

      override def merge(a: DataciteType, b: DataciteType): DataciteType = {
        reduce(a, b)
      }

      override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]

      override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]

      override def finish(reduction: DataciteType): DataciteType = reduction
    }

    val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
    val ts = dump.select(max("timestamp")).first().getLong(0)

    println(s"last Timestamp is $ts")

    // when the import is skipped, 1 forces the merge of what is already stored in targetPath
    val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1L else writeSequenceFile(hdfsTargetPath, ts, conf, bs)

    println(s"Imported from Datacite API $cnt documents")

    if (cnt > 0) {

      // the sequence file is written with IntWritable keys, hence it is read back with the same key class
      val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[IntWritable], classOf[Text])
        .map(s => s._2.toString)
        .map(s => convertAPIStringToDataciteItem(s))
      spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")

      val ds: Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType]

      dump
        .union(ds)
        .groupByKey(_.doi)
        .agg(dataciteAggregator.toColumn)
        .map(s => s._2)
        .repartition(4000)
        .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")

      // the updated dump takes the place of the previous one
      val fs = FileSystem.get(sc.hadoopConfiguration)
      fs.delete(new Path(s"$dataciteDump"), true)
      fs.rename(new Path(s"${dataciteDump}_updated"), new Path(s"$dataciteDump"))
    }
  }

  /**
    * Downloads the records from the Datacite API and appends them to a sequence file
    * whose keys are a progressive counter and whose values are the json records.
    *
    * The interval between `timestamp` and the current time is visited in windows of
    * 50000000 milliseconds, creating one DataciteAPIImporter per window. Failures are
    * logged and do not interrupt the job: the number of records written so far is
    * returned in any case.
    *
    * @param hdfsTargetPath path of the sequence file to create
    * @param timestamp      starting point of the import, in seconds since the epoch
    * @param conf           Hadoop configuration used to create the writer
    * @param bs             block size handed to DataciteAPIImporter
    * @return the number of records written
    */
  private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
    var from:Long = timestamp * 1000
    val delta:Long = 50000000L
    var client: DataciteAPIImporter = null
    val now :Long =System.currentTimeMillis()
    var i = 0
    try {
      val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
      try {
        var start: Long = System.currentTimeMillis
        while (from < now) {
          client = new DataciteAPIImporter(from, bs, from + delta)
          val key: IntWritable = new IntWritable(i)
          val value: Text = new Text
          while (client.hasNext) {
            // the key is the value of the counter before the increment
            key.set(i)
            i += 1
            value.set(client.next())
            writer.append(key, value)
            writer.hflush()
            if (i % 1000 == 0) {
              val end: Long = System.currentTimeMillis
              val time = (end - start) / 1000.0F
              println(s"Imported $i in $time seconds")
              start = System.currentTimeMillis
            }
          }
          println(s"updating from value: $from -> ${from+delta}")
          from = from + delta
        }
      } catch {
        case e: Throwable =>
          // println("Error", e) printed a tuple on stdout: the failure is now logged with its stack trace
          log.error("Error", e)
      } finally if (writer != null) writer.close()
    }
    catch {
      case e: Throwable =>
        log.error("Error", e)
    }
    i
  }

}
|
|
@ -248,7 +248,7 @@ public class PrepareProgramme {
|
|||
parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
|
||||
}
|
||||
if (current.trim().length() > parent.length()
|
||||
&& current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) {
|
||||
&& current.toLowerCase().trim().startsWith(parent)) {
|
||||
current = current.substring(parent.length() + 1);
|
||||
if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') {
|
||||
current = current.trim().substring(1).trim();
|
||||
|
|
|
@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
|
@ -33,7 +32,6 @@ public class PrepareProjects {
|
|||
// Logger bound to the enclosing class (it was created for PrepareProgramme, a copy-paste leftover)
private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
|
@ -93,7 +91,7 @@ public class PrepareProjects {
|
|||
}
|
||||
|
||||
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
|
||||
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
|
||||
return value -> {
|
||||
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
||||
List<CSVProject> csvProjectList = new ArrayList<>();
|
||||
if (csvProject.isPresent()) {
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
 * List of the error messages collected while retrying an operation.
 */
public class CollectorPluginErrorLogList extends LinkedList<String> {

	private static final long serialVersionUID = -6925786561303289704L;

	/**
	 * Renders the collected messages on a single line.
	 *
	 * @return the concatenation of "Retry #&lt;index&gt;: &lt;message&gt; / " for each message, where the index starts
	 *         from 0; the empty string when the list is empty
	 */
	@Override
	public String toString() {
		// StringBuilder avoids allocating a new String at every iteration
		final StringBuilder log = new StringBuilder();
		int index = 0;
		for (final String errorMessage : this) {
			log.append(String.format("Retry #%s: %s / ", index++, errorMessage));
		}
		return log.toString();
	}

}
|
|
@ -1,20 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
||||
|
||||
/**
 * Checked exception raised when the collection of a remote resource fails.
 */
public class CollectorServiceException extends Exception {

	private static final long serialVersionUID = 7523999812098059764L;

	/**
	 * @param cause the failure that originated this exception
	 */
	public CollectorServiceException(final Throwable cause) {
		super(cause);
	}

	/**
	 * @param message description of the failure
	 */
	public CollectorServiceException(final String message) {
		super(message);
	}

	/**
	 * @param message description of the failure
	 * @param cause the failure that originated this exception
	 */
	public CollectorServiceException(final String message, final Throwable cause) {
		super(message, cause);
	}

}
|
|
@ -1,240 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.*;
|
||||
import java.security.GeneralSecurityException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.net.ssl.HttpsURLConnection;
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @author jochen, michele, andrea
|
||||
*/
|
||||
public class HttpConnector {
|
||||
|
||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
||||
|
||||
private int maxNumberOfRetry = 6;
|
||||
private int defaultDelay = 120; // seconds
|
||||
private int readTimeOut = 120; // seconds
|
||||
|
||||
private String responseType = null;
|
||||
|
||||
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||
|
||||
public HttpConnector() {
|
||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @return the content of the downloaded resource
|
||||
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content as a stream via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @return the content of the downloaded resource as InputStream
|
||||
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
|
||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber,
|
||||
final CollectorPluginErrorLogList errorList)
|
||||
throws CollectorServiceException {
|
||||
try {
|
||||
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
try {
|
||||
return IOUtils.toString(s);
|
||||
} catch (IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(s);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
|
||||
final CollectorPluginErrorLogList errorList)
|
||||
throws CollectorServiceException {
|
||||
|
||||
if (retryNumber > maxNumberOfRetry) {
|
||||
throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList);
|
||||
}
|
||||
|
||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
||||
try {
|
||||
InputStream input = null;
|
||||
|
||||
try {
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
urlConn.setReadTimeout(readTimeOut * 1000);
|
||||
urlConn.addRequestProperty("User-Agent", userAgent);
|
||||
|
||||
if (log.isDebugEnabled()) {
|
||||
logHeaderFields(urlConn);
|
||||
}
|
||||
|
||||
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
||||
Thread.sleep(retryAfter * 1000);
|
||||
errorList.add("503 Service Unavailable");
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
|
||||
|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
|
||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||
log.debug("The requested url has been moved to " + newUrl);
|
||||
errorList
|
||||
.add(
|
||||
String
|
||||
.format(
|
||||
"%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(),
|
||||
newUrl));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
||||
log
|
||||
.error(
|
||||
String
|
||||
.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else {
|
||||
input = urlConn.getInputStream();
|
||||
responseType = urlConn.getContentType();
|
||||
return input;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
||||
|
||||
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||
if (e.getKey() != null) {
|
||||
for (String v : e.getValue()) {
|
||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0)
|
||||
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
||||
return Integer
|
||||
.parseInt(headerMap.get(key).get(0)) + 10;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) {
|
||||
return headerMap.get(key).get(0);
|
||||
}
|
||||
}
|
||||
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
||||
}
|
||||
|
||||
/**
|
||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
||||
*/
|
||||
public void initTrustManager() {
|
||||
final X509TrustManager tm = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
try {
|
||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
||||
ctx.init(null, new TrustManager[] {
|
||||
tm
|
||||
}, null);
|
||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
||||
} catch (GeneralSecurityException e) {
|
||||
log.fatal(e);
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public int getMaxNumberOfRetry() {
|
||||
return maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
||||
this.maxNumberOfRetry = maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public int getDefaultDelay() {
|
||||
return defaultDelay;
|
||||
}
|
||||
|
||||
public void setDefaultDelay(final int defaultDelay) {
|
||||
this.defaultDelay = defaultDelay;
|
||||
}
|
||||
|
||||
public int getReadTimeOut() {
|
||||
return readTimeOut;
|
||||
}
|
||||
|
||||
public void setReadTimeOut(final int readTimeOut) {
|
||||
this.readTimeOut = readTimeOut;
|
||||
}
|
||||
|
||||
public String getResponseType() {
|
||||
return responseType;
|
||||
}
|
||||
|
||||
}
|
|
@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||
|
||||
/**
|
||||
* Parses a CSV file and writes its serialization to HDFS
|
||||
|
@ -28,7 +28,7 @@ public class ReadCSV implements Closeable {
|
|||
private final Configuration conf;
|
||||
private final BufferedWriter writer;
|
||||
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private String csvFile;
|
||||
private final String csvFile;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -74,7 +74,7 @@ public class ReadCSV implements Closeable {
|
|||
throws Exception {
|
||||
this.conf = new Configuration();
|
||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||
HttpConnector httpConnector = new HttpConnector();
|
||||
HttpConnector2 httpConnector = new HttpConnector2();
|
||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
|
@ -85,7 +85,6 @@ public class ReadCSV implements Closeable {
|
|||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
this.csvFile = httpConnector.getInputSource(fileURL);
|
||||
;
|
||||
}
|
||||
|
||||
protected void write(final Object p) {
|
||||
|
|
|
@ -14,19 +14,18 @@ import org.apache.hadoop.fs.Path;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||
|
||||
/**
|
||||
* Parses an Excel file and writes its serialization to HDFS
|
||||
*/
|
||||
|
||||
public class ReadExcel implements Closeable {
|
||||
// Logger bound to the enclosing class (it was created for ReadCSV, a copy-paste leftover)
private static final Log log = LogFactory.getLog(ReadExcel.class);
|
||||
private final Configuration conf;
|
||||
private final BufferedWriter writer;
|
||||
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private InputStream excelFile;
|
||||
private final InputStream excelFile;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -73,7 +72,7 @@ public class ReadExcel implements Closeable {
|
|||
throws Exception {
|
||||
this.conf = new Configuration();
|
||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||
HttpConnector httpConnector = new HttpConnector();
|
||||
HttpConnector2 httpConnector = new HttpConnector2();
|
||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
|
@ -84,7 +83,6 @@ public class ReadExcel implements Closeable {
|
|||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
|
||||
;
|
||||
}
|
||||
|
||||
protected void write(final Object p) {
|
||||
|
|
|
@ -3,11 +3,11 @@ package eu.dnetlib.dhp.actionmanager.ror;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
package eu.dnetlib.dhp.aggregation.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
public class AggregationCounter implements Serializable {
|
||||
private LongAccumulator totalItems;
|
||||
private LongAccumulator errorItems;
|
||||
private LongAccumulator processedItems;
|
||||
|
||||
public AggregationCounter() {
|
||||
}
|
||||
|
||||
public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) {
|
||||
this.totalItems = totalItems;
|
||||
this.errorItems = errorItems;
|
||||
this.processedItems = processedItems;
|
||||
}
|
||||
|
||||
public LongAccumulator getTotalItems() {
|
||||
return totalItems;
|
||||
}
|
||||
|
||||
public void setTotalItems(LongAccumulator totalItems) {
|
||||
this.totalItems = totalItems;
|
||||
}
|
||||
|
||||
public LongAccumulator getErrorItems() {
|
||||
return errorItems;
|
||||
}
|
||||
|
||||
public void setErrorItems(LongAccumulator errorItems) {
|
||||
this.errorItems = errorItems;
|
||||
}
|
||||
|
||||
public LongAccumulator getProcessedItems() {
|
||||
return processedItems;
|
||||
}
|
||||
|
||||
public void setProcessedItems(LongAccumulator processedItems) {
|
||||
this.processedItems = processedItems;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
|
||||
package eu.dnetlib.dhp.aggregation.common;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.message.MessageSender;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
|
||||
|
||||
private MessageSender messageSender;
|
||||
|
||||
public AggregatorReport() {
|
||||
}
|
||||
|
||||
public AggregatorReport(MessageSender messageSender) throws IOException {
|
||||
this.messageSender = messageSender;
|
||||
}
|
||||
|
||||
public void ongoing(Long current, Long total) {
|
||||
messageSender.sendMessage(current, total);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (Objects.nonNull(messageSender)) {
|
||||
log.info("closing report: ");
|
||||
this.forEach((k, v) -> log.info("{} - {}", k, v));
|
||||
|
||||
Map<String, String> m = new HashMap<>();
|
||||
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
|
||||
messageSender.sendReport(m);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.aggregation.common;
|
||||
|
||||
/**
 * Callback queried by a reporting job to obtain the progress figures to send.
 */
public interface ReporterCallback {

	/**
	 * @return the current progress value
	 */
	Long getCurrent();

	/**
	 * @return the total value; implementations may return {@code null}
	 */
	Long getTotal();

}
|
|
@ -0,0 +1,41 @@
|
|||
|
||||
package eu.dnetlib.dhp.aggregation.common;
|
||||
|
||||
import java.util.TimerTask;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public abstract class ReportingJob {
|
||||
|
||||
/**
|
||||
* Frequency (seconds) for sending ongoing messages to report the collection task advancement
|
||||
*/
|
||||
public static final int ONGOING_REPORT_FREQUENCY = 5;
|
||||
|
||||
/**
|
||||
* Initial delay (seconds) for sending ongoing messages to report the collection task advancement
|
||||
*/
|
||||
public static final int INITIAL_DELAY = 2;
|
||||
|
||||
private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
|
||||
|
||||
protected final AggregatorReport report;
|
||||
|
||||
public ReportingJob(AggregatorReport report) {
|
||||
this.report = report;
|
||||
}
|
||||
|
||||
protected void schedule(final ReporterCallback callback) {
|
||||
executor.scheduleAtFixedRate(new TimerTask() {
|
||||
@Override
|
||||
public void run() {
|
||||
report.ongoing(callback.getCurrent(), callback.getTotal());
|
||||
}
|
||||
}, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS);
|
||||
}
|
||||
|
||||
protected void shutdown() {
|
||||
executor.shutdown();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,136 @@
|
|||
|
||||
package eu.dnetlib.dhp.aggregation.mdstore;
|
||||
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.rest.DNetRestClient;
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
|
||||
|
||||
public class MDStoreActionNode {
|
||||
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
|
||||
|
||||
enum MDAction {
|
||||
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
|
||||
}
|
||||
|
||||
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
|
||||
|
||||
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
|
||||
public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
|
||||
|
||||
public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading";
|
||||
public static final String READ_UNLOCK_URL = "%s/version/%s/endReading";
|
||||
|
||||
private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
|
||||
private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
MDStoreActionNode.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
|
||||
argumentParser.parseArgument(args);
|
||||
|
||||
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
|
||||
|
||||
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
|
||||
log.info("Current action is {}", action);
|
||||
|
||||
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
|
||||
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
|
||||
|
||||
switch (action) {
|
||||
case NEW_VERSION: {
|
||||
final String mdStoreID = argumentParser.get("mdStoreID");
|
||||
if (StringUtils.isBlank(mdStoreID)) {
|
||||
throw new IllegalArgumentException("missing or empty argument mdStoreId");
|
||||
}
|
||||
final MDStoreVersion currentVersion = DNetRestClient
|
||||
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||
populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||
break;
|
||||
}
|
||||
case COMMIT: {
|
||||
|
||||
final String hdfsuri = argumentParser.get("namenode");
|
||||
if (StringUtils.isBlank(hdfsuri)) {
|
||||
throw new IllegalArgumentException("missing or empty argument namenode");
|
||||
}
|
||||
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||
|
||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||
}
|
||||
Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
|
||||
|
||||
try (
|
||||
FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri));
|
||||
FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) {
|
||||
|
||||
final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
|
||||
|
||||
fs.create(hdfstoreSizepath);
|
||||
DNetRestClient
|
||||
.doGET(
|
||||
String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case ROLLBACK: {
|
||||
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||
|
||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||
}
|
||||
DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
|
||||
break;
|
||||
}
|
||||
|
||||
case READ_LOCK: {
|
||||
final String mdStoreID = argumentParser.get("mdStoreID");
|
||||
if (StringUtils.isBlank(mdStoreID)) {
|
||||
throw new IllegalArgumentException("missing or empty argument mdStoreId");
|
||||
}
|
||||
final MDStoreVersion currentVersion = DNetRestClient
|
||||
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||
populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||
break;
|
||||
}
|
||||
case READ_UNLOCK: {
|
||||
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
|
||||
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||
|
||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||
}
|
||||
DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid action");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,16 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker;
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
public class DnetCollectorException extends Exception {
|
||||
public class CollectorException extends Exception {
|
||||
|
||||
/** */
|
||||
private static final long serialVersionUID = -290723075076039757L;
|
||||
|
||||
public DnetCollectorException() {
|
||||
public CollectorException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public DnetCollectorException(
|
||||
public CollectorException(
|
||||
final String message,
|
||||
final Throwable cause,
|
||||
final boolean enableSuppression,
|
||||
|
@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception {
|
|||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
|
||||
public DnetCollectorException(final String message, final Throwable cause) {
|
||||
public CollectorException(final String message, final Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public DnetCollectorException(final String message) {
|
||||
public CollectorException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public DnetCollectorException(final Throwable cause) {
|
||||
public CollectorException(final Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.DeflateCodec;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
|
||||
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
|
||||
|
||||
public class CollectorWorker extends ReportingJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
|
||||
|
||||
private final ApiDescriptor api;
|
||||
|
||||
private final FileSystem fileSystem;
|
||||
|
||||
private final MDStoreVersion mdStoreVersion;
|
||||
|
||||
private final HttpClientParams clientParams;
|
||||
|
||||
public CollectorWorker(
|
||||
final ApiDescriptor api,
|
||||
final FileSystem fileSystem,
|
||||
final MDStoreVersion mdStoreVersion,
|
||||
final HttpClientParams clientParams,
|
||||
final AggregatorReport report) {
|
||||
super(report);
|
||||
this.api = api;
|
||||
this.fileSystem = fileSystem;
|
||||
this.mdStoreVersion = mdStoreVersion;
|
||||
this.clientParams = clientParams;
|
||||
}
|
||||
|
||||
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
|
||||
|
||||
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||
log.info("outputPath path is {}", outputPath);
|
||||
|
||||
final CollectorPlugin plugin = getCollectorPlugin();
|
||||
final AtomicInteger counter = new AtomicInteger(0);
|
||||
|
||||
scheduleReport(counter);
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
fileSystem.getConf(),
|
||||
SequenceFile.Writer.file(new Path(outputPath)),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
final IntWritable key = new IntWritable(counter.get());
|
||||
final Text value = new Text();
|
||||
plugin
|
||||
.collect(api, report)
|
||||
.forEach(
|
||||
content -> {
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} catch (Throwable e) {
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e);
|
||||
} finally {
|
||||
shutdown();
|
||||
report.ongoing(counter.longValue(), counter.longValue());
|
||||
}
|
||||
}
|
||||
|
||||
private void scheduleReport(AtomicInteger counter) {
|
||||
schedule(new ReporterCallback() {
|
||||
@Override
|
||||
public Long getCurrent() {
|
||||
return counter.longValue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getTotal() {
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
|
||||
|
||||
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
|
||||
case oai:
|
||||
return new OaiCollectorPlugin(clientParams);
|
||||
case rest_json2xml:
|
||||
return new RestCollectorPlugin(clientParams);
|
||||
case other:
|
||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||
.ofNullable(api.getParams().get("other_plugin_type"))
|
||||
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
|
||||
.get();
|
||||
|
||||
switch (plugin) {
|
||||
case mdstore_mongodb_dump:
|
||||
return new MongoDbDumpCollectorPlugin(fileSystem);
|
||||
case mdstore_mongodb:
|
||||
return new MDStoreCollectorPlugin();
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
||||
}
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,135 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.message.MessageSender;
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
|
||||
|
||||
/**
|
||||
* CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes
|
||||
* into HDFS. This application will be executed on the hadoop cluster, where invoked in the context of the metadata collection
|
||||
* oozie workflow, it will receive all the input parameters necessary to instantiate the specific collection plugin and the
|
||||
* relative specific configurations
|
||||
*
|
||||
* @author Sandro La Bruzzo, Claudio Atzori
|
||||
*/
|
||||
public class CollectorWorkerApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
|
||||
|
||||
private final FileSystem fileSystem;
|
||||
|
||||
public CollectorWorkerApplication(FileSystem fileSystem) {
|
||||
this.fileSystem = fileSystem;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param args
|
||||
*/
|
||||
public static void main(final String[] args)
|
||||
throws ParseException, IOException, UnknownCollectorPluginException, CollectorException {
|
||||
|
||||
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
CollectorWorkerApplication.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json")));
|
||||
argumentParser.parseArgument(args);
|
||||
|
||||
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
|
||||
|
||||
final String hdfsuri = argumentParser.get("namenode");
|
||||
log.info("hdfsURI is {}", hdfsuri);
|
||||
|
||||
final String apiDescriptor = argumentParser.get("apidescriptor");
|
||||
log.info("apiDescriptor is {}", apiDescriptor);
|
||||
|
||||
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
|
||||
log.info("mdStoreVersion is {}", mdStoreVersion);
|
||||
|
||||
final String dnetMessageManagerURL = argumentParser.get(DNET_MESSAGE_MGR_URL);
|
||||
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
|
||||
|
||||
final String workflowId = argumentParser.get("workflowId");
|
||||
log.info("workflowId is {}", workflowId);
|
||||
|
||||
final HttpClientParams clientParams = getClientParams(argumentParser);
|
||||
|
||||
final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
|
||||
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri));
|
||||
|
||||
new CollectorWorkerApplication(fileSystem)
|
||||
.run(mdStoreVersion, clientParams, api, dnetMessageManagerURL, workflowId);
|
||||
}
|
||||
|
||||
protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api,
|
||||
String dnetMessageManagerURL, String workflowId)
|
||||
throws IOException, CollectorException, UnknownCollectorPluginException {
|
||||
|
||||
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
|
||||
final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId);
|
||||
|
||||
try (AggregatorReport report = new AggregatorReport(ms)) {
|
||||
new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect();
|
||||
}
|
||||
}
|
||||
|
||||
private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) {
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
clientParams
|
||||
.setMaxNumberOfRetry(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(MAX_NUMBER_OF_RETRY))
|
||||
.map(Integer::parseInt)
|
||||
.orElse(HttpClientParams._maxNumberOfRetry));
|
||||
log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry());
|
||||
|
||||
clientParams
|
||||
.setRequestDelay(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(REQUEST_DELAY))
|
||||
.map(Integer::parseInt)
|
||||
.orElse(HttpClientParams._requestDelay));
|
||||
log.info("requestDelay is {}", clientParams.getRequestDelay());
|
||||
|
||||
clientParams
|
||||
.setRetryDelay(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(RETRY_DELAY))
|
||||
.map(Integer::parseInt)
|
||||
.orElse(HttpClientParams._retryDelay));
|
||||
log.info("retryDelay is {}", clientParams.getRetryDelay());
|
||||
|
||||
clientParams
|
||||
.setConnectTimeOut(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(CONNECT_TIMEOUT))
|
||||
.map(Integer::parseInt)
|
||||
.orElse(HttpClientParams._connectTimeOut));
|
||||
log.info("connectTimeOut is {}", clientParams.getConnectTimeOut());
|
||||
|
||||
clientParams
|
||||
.setReadTimeOut(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(READ_TIMEOUT))
|
||||
.map(Integer::parseInt)
|
||||
.orElse(HttpClientParams._readTimeOut));
|
||||
log.info("readTimeOut is {}", clientParams.getReadTimeOut());
|
||||
return clientParams;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,28 +1,26 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.cli.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
@ -30,19 +28,172 @@ import org.dom4j.io.SAXReader;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.model.mdstore.Provenance;
|
||||
import eu.dnetlib.message.Message;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
import eu.dnetlib.message.MessageType;
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
|
||||
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.schema.mdstore.Provenance;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateNativeStoreSparkJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateNativeStoreSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/generate_native_input_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String provenanceArgument = parser.get("provenance");
|
||||
log.info("Provenance is {}", provenanceArgument);
|
||||
final Provenance provenance = MAPPER.readValue(provenanceArgument, Provenance.class);
|
||||
|
||||
final String dateOfCollectionArgs = parser.get("dateOfCollection");
|
||||
log.info("dateOfCollection is {}", dateOfCollectionArgs);
|
||||
final Long dateOfCollection = new Long(dateOfCollectionArgs);
|
||||
|
||||
String mdStoreVersion = parser.get("mdStoreVersion");
|
||||
log.info("mdStoreVersion is {}", mdStoreVersion);
|
||||
|
||||
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
|
||||
|
||||
String readMdStoreVersionParam = parser.get("readMdStoreVersion");
|
||||
log.info("readMdStoreVersion is {}", readMdStoreVersionParam);
|
||||
|
||||
final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? null
|
||||
: MAPPER.readValue(readMdStoreVersionParam, MDStoreVersion.class);
|
||||
|
||||
final String xpath = parser.get("xpath");
|
||||
log.info("xpath is {}", xpath);
|
||||
|
||||
final String encoding = parser.get("encoding");
|
||||
log.info("encoding is {}", encoding);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> createNativeMDStore(
|
||||
spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion));
|
||||
}
|
||||
|
||||
private static void createNativeMDStore(SparkSession spark,
|
||||
Provenance provenance,
|
||||
Long dateOfCollection,
|
||||
String xpath,
|
||||
String encoding,
|
||||
MDStoreVersion currentVersion,
|
||||
MDStoreVersion readVersion) throws IOException {
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS);
|
||||
final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS);
|
||||
|
||||
final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||
final JavaRDD<MetadataRecord> nativeStore = sc
|
||||
.sequenceFile(seqFilePath, IntWritable.class, Text.class)
|
||||
.map(
|
||||
item -> parseRecord(
|
||||
item._2().toString(),
|
||||
xpath,
|
||||
encoding,
|
||||
provenance,
|
||||
dateOfCollection,
|
||||
totalItems,
|
||||
invalidRecords))
|
||||
.filter(Objects::nonNull)
|
||||
.distinct();
|
||||
|
||||
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
||||
final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder);
|
||||
|
||||
final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH;
|
||||
|
||||
if (readVersion != null) { // INCREMENTAL MODE
|
||||
log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath());
|
||||
Dataset<MetadataRecord> currentMdStoreVersion = spark
|
||||
.read()
|
||||
.load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH)
|
||||
.as(encoder);
|
||||
TypedColumn<MetadataRecord, MetadataRecord> aggregator = new MDStoreAggregator().toColumn();
|
||||
|
||||
final Dataset<MetadataRecord> map = currentMdStoreVersion
|
||||
.union(mdstore)
|
||||
.groupByKey(
|
||||
(MapFunction<MetadataRecord, String>) MetadataRecord::getId,
|
||||
Encoders.STRING())
|
||||
.agg(aggregator)
|
||||
.map((MapFunction<Tuple2<String, MetadataRecord>, MetadataRecord>) Tuple2::_2, encoder);
|
||||
|
||||
map.select("id").takeAsList(100).forEach(s -> log.info(s.toString()));
|
||||
|
||||
saveDataset(map, targetPath);
|
||||
|
||||
} else {
|
||||
saveDataset(mdstore, targetPath);
|
||||
}
|
||||
|
||||
final Long total = spark.read().load(targetPath).count();
|
||||
log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName());
|
||||
|
||||
writeHdfsFile(
|
||||
spark.sparkContext().hadoopConfiguration(), total.toString(),
|
||||
currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
|
||||
}
|
||||
|
||||
public static class MDStoreAggregator extends Aggregator<MetadataRecord, MetadataRecord, MetadataRecord> {
|
||||
|
||||
@Override
|
||||
public MetadataRecord zero() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) {
|
||||
return getLatestRecord(b, a);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MetadataRecord merge(MetadataRecord b, MetadataRecord a) {
|
||||
return getLatestRecord(b, a);
|
||||
}
|
||||
|
||||
private MetadataRecord getLatestRecord(MetadataRecord b, MetadataRecord a) {
|
||||
if (b == null)
|
||||
return a;
|
||||
|
||||
if (a == null)
|
||||
return b;
|
||||
return (a.getDateOfCollection() > b.getDateOfCollection()) ? a : b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MetadataRecord finish(MetadataRecord r) {
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<MetadataRecord> bufferEncoder() {
|
||||
return Encoders.bean(MetadataRecord.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<MetadataRecord> outputEncoder() {
|
||||
return Encoders.bean(MetadataRecord.class);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static MetadataRecord parseRecord(
|
||||
final String input,
|
||||
final String xpath,
|
||||
|
@ -64,112 +215,11 @@ public class GenerateNativeStoreSparkJob {
|
|||
invalidRecords.add(1);
|
||||
return null;
|
||||
}
|
||||
return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection);
|
||||
return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection);
|
||||
} catch (Throwable e) {
|
||||
if (invalidRecords != null)
|
||||
invalidRecords.add(1);
|
||||
e.printStackTrace();
|
||||
invalidRecords.add(1);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateNativeStoreSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/collection_input_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class);
|
||||
final long dateOfCollection = new Long(parser.get("dateOfCollection"));
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final Map<String, String> ongoingMap = new HashMap<>();
|
||||
final Map<String, String> reportMap = new HashMap<>();
|
||||
|
||||
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
final JavaPairRDD<IntWritable, Text> inputRDD = sc
|
||||
.sequenceFile(parser.get("input"), IntWritable.class, Text.class);
|
||||
|
||||
final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");
|
||||
final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");
|
||||
|
||||
final MessageManager manager = new MessageManager(
|
||||
parser.get("rabbitHost"),
|
||||
parser.get("rabbitUser"),
|
||||
parser.get("rabbitPassword"),
|
||||
false,
|
||||
false,
|
||||
null);
|
||||
|
||||
final JavaRDD<MetadataRecord> mappeRDD = inputRDD
|
||||
.map(
|
||||
item -> parseRecord(
|
||||
item._2().toString(),
|
||||
parser.get("xpath"),
|
||||
parser.get("encoding"),
|
||||
provenance,
|
||||
dateOfCollection,
|
||||
totalItems,
|
||||
invalidRecords))
|
||||
.filter(Objects::nonNull)
|
||||
.distinct();
|
||||
|
||||
ongoingMap.put("ongoing", "0");
|
||||
if (!test) {
|
||||
manager
|
||||
.sendMessage(
|
||||
new Message(
|
||||
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
|
||||
parser.get("rabbitOngoingQueue"),
|
||||
true,
|
||||
false);
|
||||
}
|
||||
|
||||
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
||||
final Dataset<MetadataRecord> mdstore = spark.createDataset(mappeRDD.rdd(), encoder);
|
||||
final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
|
||||
mdStoreRecords.add(mdstore.count());
|
||||
ongoingMap.put("ongoing", "" + totalItems.value());
|
||||
if (!test) {
|
||||
manager
|
||||
.sendMessage(
|
||||
new Message(
|
||||
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
|
||||
parser.get("rabbitOngoingQueue"),
|
||||
true,
|
||||
false);
|
||||
}
|
||||
mdstore.write().format("parquet").save(parser.get("output"));
|
||||
reportMap.put("inputItem", "" + totalItems.value());
|
||||
reportMap.put("invalidRecords", "" + invalidRecords.value());
|
||||
reportMap.put("mdStoreSize", "" + mdStoreRecords.value());
|
||||
if (!test) {
|
||||
manager
|
||||
.sendMessage(
|
||||
new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
|
||||
parser.get("rabbitReportQueue"),
|
||||
true,
|
||||
false);
|
||||
manager.close();
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
/**
 * Bundles the http connection parameters driving the client behaviour.
 *
 * <p>Instances are plain mutable beans: every parameter has a getter and a setter. The no-arg
 * constructor initialises each parameter with the corresponding default constant declared below.</p>
 */
public class HttpClientParams {

	// Defaults. Declared final so that they cannot be reassigned at runtime: they are shared by
	// every instance created through the no-arg constructor.

	/** Default for {@link #getMaxNumberOfRetry()}. */
	public static final int _maxNumberOfRetry = 3;

	/** Default for {@link #getRequestDelay()}, in milliseconds. */
	public static final int _requestDelay = 0; // milliseconds

	/** Default for {@link #getRetryDelay()}, in seconds. */
	public static final int _retryDelay = 10; // seconds

	/** Default for {@link #getConnectTimeOut()}, in seconds. */
	public static final int _connectTimeOut = 10; // seconds

	/** Default for {@link #getReadTimeOut()}, in seconds. */
	public static final int _readTimeOut = 30; // seconds

	/**
	 * Maximum number of allowed retries before failing
	 */
	private int maxNumberOfRetry;

	/**
	 * Delay between requests (Milliseconds)
	 */
	private int requestDelay;

	/**
	 * Time to wait after a failure before retrying (Seconds)
	 */
	private int retryDelay;

	/**
	 * Connect timeout (Seconds)
	 */
	private int connectTimeOut;

	/**
	 * Read timeout (Seconds)
	 */
	private int readTimeOut;

	/**
	 * Creates a parameter set holding the default values.
	 */
	public HttpClientParams() {
		this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
	}

	/**
	 * Creates a parameter set with explicit values. No validation is applied to the arguments.
	 *
	 * @param maxNumberOfRetry maximum number of allowed retries before failing
	 * @param requestDelay delay between requests, in milliseconds
	 * @param retryDelay time to wait after a failure before retrying, in seconds
	 * @param connectTimeOut connect timeout, in seconds
	 * @param readTimeOut read timeout, in seconds
	 */
	public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
		int readTimeOut) {
		this.maxNumberOfRetry = maxNumberOfRetry;
		this.requestDelay = requestDelay;
		this.retryDelay = retryDelay;
		this.connectTimeOut = connectTimeOut;
		this.readTimeOut = readTimeOut;
	}

	/** @return the maximum number of allowed retries before failing */
	public int getMaxNumberOfRetry() {
		return maxNumberOfRetry;
	}

	/** @param maxNumberOfRetry the maximum number of allowed retries before failing */
	public void setMaxNumberOfRetry(int maxNumberOfRetry) {
		this.maxNumberOfRetry = maxNumberOfRetry;
	}

	/** @return the delay between requests, in milliseconds */
	public int getRequestDelay() {
		return requestDelay;
	}

	/** @param requestDelay the delay between requests, in milliseconds */
	public void setRequestDelay(int requestDelay) {
		this.requestDelay = requestDelay;
	}

	/** @return the time to wait after a failure before retrying, in seconds */
	public int getRetryDelay() {
		return retryDelay;
	}

	/** @param retryDelay the time to wait after a failure before retrying, in seconds */
	public void setRetryDelay(int retryDelay) {
		this.retryDelay = retryDelay;
	}

	/** @param connectTimeOut the connect timeout, in seconds */
	public void setConnectTimeOut(int connectTimeOut) {
		this.connectTimeOut = connectTimeOut;
	}

	/** @return the connect timeout, in seconds */
	public int getConnectTimeOut() {
		return connectTimeOut;
	}

	/** @return the read timeout, in seconds */
	public int getReadTimeOut() {
		return readTimeOut;
	}

	/** @param readTimeOut the read timeout, in seconds */
	public void setReadTimeOut(int readTimeOut) {
		this.readTimeOut = readTimeOut;
	}

}
|
|
@ -0,0 +1,259 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.*;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
|
||||
/**
|
||||
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
|
||||
*
|
||||
* @author jochen, michele, andrea, alessia, claudio
|
||||
*/
|
||||
public class HttpConnector2 {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
|
||||
|
||||
private static final String REPORT_PREFIX = "http:";
|
||||
|
||||
private HttpClientParams clientParams;
|
||||
|
||||
private String responseType = null;
|
||||
|
||||
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||
|
||||
public HttpConnector2() {
|
||||
this(new HttpClientParams());
|
||||
}
|
||||
|
||||
public HttpConnector2(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||
}
|
||||
|
||||
/**
|
||||
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
|
||||
*/
|
||||
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
|
||||
return IOUtils.toInputStream(getInputSource(requestUrl));
|
||||
}
|
||||
|
||||
/**
|
||||
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
|
||||
*/
|
||||
public String getInputSource(final String requestUrl) throws CollectorException {
|
||||
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @param report the list of errors
|
||||
* @return the content of the downloaded resource
|
||||
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public String getInputSource(final String requestUrl, AggregatorReport report)
|
||||
throws CollectorException {
|
||||
return attemptDownloadAsString(requestUrl, 1, report);
|
||||
}
|
||||
|
||||
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
|
||||
final AggregatorReport report) throws CollectorException {
|
||||
|
||||
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
|
||||
return IOUtils.toString(s);
|
||||
} catch (IOException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
|
||||
final AggregatorReport report) throws CollectorException, IOException {
|
||||
|
||||
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
|
||||
final String msg = String
|
||||
.format(
|
||||
"Max number of retries (%s/%s) exceeded, failing.",
|
||||
retryNumber, getClientParams().getMaxNumberOfRetry());
|
||||
log.error(msg);
|
||||
throw new CollectorException(msg);
|
||||
}
|
||||
|
||||
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
|
||||
|
||||
InputStream input = null;
|
||||
|
||||
try {
|
||||
if (getClientParams().getRequestDelay() > 0) {
|
||||
backoffAndSleep(getClientParams().getRequestDelay());
|
||||
}
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
|
||||
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
|
||||
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
|
||||
|
||||
if (log.isDebugEnabled()) {
|
||||
logHeaderFields(urlConn);
|
||||
}
|
||||
|
||||
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||
if (is2xx(urlConn.getResponseCode())) {
|
||||
input = urlConn.getInputStream();
|
||||
responseType = urlConn.getContentType();
|
||||
return input;
|
||||
}
|
||||
if (is3xx(urlConn.getResponseCode())) {
|
||||
// REDIRECTS
|
||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||
log.info(String.format("The requested url has been moved to %s", newUrl));
|
||||
report
|
||||
.put(
|
||||
REPORT_PREFIX + urlConn.getResponseCode(),
|
||||
String.format("Moved to: %s", newUrl));
|
||||
urlConn.disconnect();
|
||||
if (retryAfter > 0) {
|
||||
backoffAndSleep(retryAfter);
|
||||
}
|
||||
return attemptDownload(newUrl, retryNumber + 1, report);
|
||||
}
|
||||
if (is4xx(urlConn.getResponseCode())) {
|
||||
// CLIENT ERROR, DO NOT RETRY
|
||||
report
|
||||
.put(
|
||||
REPORT_PREFIX + urlConn.getResponseCode(),
|
||||
String
|
||||
.format(
|
||||
"%s error: %s", requestUrl, urlConn.getResponseMessage()));
|
||||
throw new CollectorException("4xx error: request will not be repeated. " + report);
|
||||
}
|
||||
if (is5xx(urlConn.getResponseCode())) {
|
||||
// SERVER SIDE ERRORS RETRY ONLY on 503
|
||||
switch (urlConn.getResponseCode()) {
|
||||
case HttpURLConnection.HTTP_UNAVAILABLE:
|
||||
if (retryAfter > 0) {
|
||||
log
|
||||
.warn(
|
||||
requestUrl + " - waiting and repeating request after suggested retry-after "
|
||||
+ retryAfter + " sec.");
|
||||
backoffAndSleep(retryAfter * 1000);
|
||||
} else {
|
||||
log
|
||||
.warn(
|
||||
requestUrl + " - waiting and repeating request after default delay of "
|
||||
+ getClientParams().getRetryDelay() + " sec.");
|
||||
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
|
||||
}
|
||||
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, report);
|
||||
default:
|
||||
report
|
||||
.put(
|
||||
REPORT_PREFIX + urlConn.getResponseCode(),
|
||||
String
|
||||
.format(
|
||||
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
|
||||
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
|
||||
}
|
||||
}
|
||||
throw new CollectorException(
|
||||
String
|
||||
.format(
|
||||
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||
MAPPER.writeValueAsString(report)));
|
||||
} catch (MalformedURLException | UnknownHostException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e.getMessage(), e);
|
||||
} catch (SocketTimeoutException | SocketException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||
return attemptDownload(requestUrl, retryNumber + 1, report);
|
||||
}
|
||||
}
|
||||
|
||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
||||
|
||||
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||
if (e.getKey() != null) {
|
||||
for (String v : e.getValue()) {
|
||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
|
||||
log.info("I'm going to sleep for {}ms", sleepTimeMs);
|
||||
try {
|
||||
Thread.sleep(sleepTimeMs);
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0)
|
||||
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
||||
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
|
||||
return headerMap.get(key).get(0);
|
||||
}
|
||||
}
|
||||
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
|
||||
}
|
||||
|
||||
private boolean is2xx(final int statusCode) {
|
||||
return statusCode >= 200 && statusCode <= 299;
|
||||
}
|
||||
|
||||
private boolean is4xx(final int statusCode) {
|
||||
return statusCode >= 400 && statusCode <= 499;
|
||||
}
|
||||
|
||||
private boolean is3xx(final int statusCode) {
|
||||
return statusCode >= 300 && statusCode <= 399;
|
||||
}
|
||||
|
||||
private boolean is5xx(final int statusCode) {
|
||||
return statusCode >= 500 && statusCode <= 599;
|
||||
}
|
||||
|
||||
public String getResponseType() {
|
||||
return responseType;
|
||||
}
|
||||
|
||||
public HttpClientParams getClientParams() {
|
||||
return clientParams;
|
||||
}
|
||||
|
||||
public void setClientParams(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class JsonUtils {
|
||||
|
||||
private static final Log log = LogFactory.getLog(JsonUtils.class);
|
||||
|
||||
public static final String wrapName = "recordWrap";
|
||||
|
||||
/**
|
||||
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
|
||||
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
|
||||
* and work-around for the JSON to XML converting of org.json.XML-package.
|
||||
*
|
||||
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
|
||||
*
|
||||
* @param jsonInput
|
||||
* @return convertedJsonKeynameOutput
|
||||
*/
|
||||
public String syntaxConvertJsonKeyNames(String jsonInput) {
|
||||
|
||||
log.trace("before convertJsonKeyNames: " + jsonInput);
|
||||
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
|
||||
// replace ' 's in JSON Namens with '_'
|
||||
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
|
||||
}
|
||||
|
||||
// replace forward-slash (sign '/' ) in JSON Names with '_'
|
||||
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
|
||||
}
|
||||
|
||||
// replace '(' in JSON Names with ''
|
||||
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
|
||||
}
|
||||
|
||||
// replace ')' in JSON Names with ''
|
||||
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
|
||||
}
|
||||
|
||||
// add prefix of startNumbers in JSON Keynames with 'n_'
|
||||
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
|
||||
}
|
||||
// add prefix of only numbers in JSON Keynames with 'm_'
|
||||
while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
|
||||
}
|
||||
|
||||
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
|
||||
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
|
||||
}
|
||||
|
||||
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
|
||||
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
|
||||
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
|
||||
// }
|
||||
|
||||
// replace '=' in JSON Keynames with '-'
|
||||
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
|
||||
}
|
||||
|
||||
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
|
||||
return jsonInput;
|
||||
}
|
||||
|
||||
public String convertToXML(final String jsonRecord) {
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
|
||||
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
|
||||
log.trace("before inputStream: " + resultXml);
|
||||
resultXml = XmlCleaner.cleanAllEntities(resultXml);
|
||||
log.trace("after cleaning: " + resultXml);
|
||||
return resultXml;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
/**
 * Checked exception dedicated to unknown collector plugins. It adds no state of its own: every
 * constructor simply delegates to the matching {@link Exception} constructor.
 */
public class UnknownCollectorPluginException extends Exception {

	private static final long serialVersionUID = -290723075076039757L;

	/**
	 * Creates an exception with neither a message nor a cause.
	 */
	public UnknownCollectorPluginException() {
		super();
	}

	/**
	 * Creates an exception with the given message.
	 *
	 * @param message the detail message
	 */
	public UnknownCollectorPluginException(String message) {
		super(message);
	}

	/**
	 * Creates an exception wrapping the given cause.
	 *
	 * @param cause the cause
	 */
	public UnknownCollectorPluginException(Throwable cause) {
		super(cause);
	}

	/**
	 * Creates an exception with the given message and cause.
	 *
	 * @param message the detail message
	 * @param cause the cause
	 */
	public UnknownCollectorPluginException(String message, Throwable cause) {
		super(message, cause);
	}

	/**
	 * Creates an exception with full control over suppression and stack trace writability.
	 *
	 * @param message the detail message
	 * @param cause the cause
	 * @param enableSuppression whether or not suppression is enabled
	 * @param writableStackTrace whether or not the stack trace should be writable
	 */
	public UnknownCollectorPluginException(
		String message,
		Throwable cause,
		boolean enableSuppression,
		boolean writableStackTrace) {
		super(message, cause, enableSuppression, writableStackTrace);
	}
}
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker.utils;
|
||||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
|
@ -3,10 +3,21 @@ package eu.dnetlib.dhp.collection.plugin;
|
|||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
|
||||
/**
 * Contract of a collection plugin: given the description of an API, it produces a stream of Strings.
 * NOTE(review): presumably each String is one collected record - confirm against the implementations.
 */
public interface CollectorPlugin {
|
||||
|
||||
/**
 * NOTE(review): this single-argument signature is shown together with the two-argument one below;
 * the hunk header and the changed imports suggest this is the line removed by the change - confirm.
 */
Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
|
||||
/**
 * Plugin names: oai, other, rest_json2xml.
 */
enum NAME {
|
||||
oai, other, rest_json2xml;
|
||||
|
||||
/**
 * Further names: mdstore_mongodb_dump, mdstore_mongodb.
 * NOTE(review): presumably they qualify the plugin when NAME is 'other' - confirm with the caller.
 */
public enum OTHER_NAME {
|
||||
mdstore_mongodb_dump, mdstore_mongodb
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Collects the content exposed by the given API.
 *
 * @param api the description of the API to collect from
 * @param report the report handed to the plugin
 *        NOTE(review): presumably used to record the errors met while collecting - confirm
 * @return the stream of collected Strings
 * @throws CollectorException declared for collection failures
 */
Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException;
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.mongodb;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.bson.Document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.mongodb.MongoClient;
|
||||
import com.mongodb.MongoClientURI;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.common.MdstoreClient;
|
||||
|
||||
public class MDStoreCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class);
|
||||
|
||||
public static final String MONGODB_DBNAME = "mongodb_dbname";
|
||||
public static final String MDSTORE_ID = "mdstore_id";
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
|
||||
|
||||
final String mongoBaseUrl = Optional
|
||||
.ofNullable(api.getBaseUrl())
|
||||
.orElseThrow(
|
||||
() -> new CollectorException(
|
||||
"missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl"));
|
||||
log.info("mongoBaseUrl: {}", mongoBaseUrl);
|
||||
|
||||
final String dbName = Optional
|
||||
.ofNullable(api.getParams().get(MONGODB_DBNAME))
|
||||
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME)));
|
||||
log.info("dbName: {}", dbName);
|
||||
|
||||
final String mdId = Optional
|
||||
.ofNullable(api.getParams().get(MDSTORE_ID))
|
||||
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID)));
|
||||
log.info("mdId: {}", mdId);
|
||||
|
||||
final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName);
|
||||
final MongoCollection<Document> mdstore = client.mdStore(mdId);
|
||||
long size = mdstore.count();
|
||||
|
||||
return StreamSupport
|
||||
.stream(
|
||||
Spliterators.spliterator(mdstore.find().iterator(), size, Spliterator.SIZED), false)
|
||||
.map(doc -> doc.getString("body"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.mongodb;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class MongoDbDumpCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
public static final String PATH_PARAM = "path";
|
||||
public static final String BODY_JSONPATH = "$.body";
|
||||
|
||||
public FileSystem fileSystem;
|
||||
|
||||
public MongoDbDumpCollectorPlugin(FileSystem fileSystem) {
|
||||
this.fileSystem = fileSystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
|
||||
|
||||
final Path path = Optional
|
||||
.ofNullable(api.getParams().get("path"))
|
||||
.map(Path::new)
|
||||
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM)));
|
||||
|
||||
try {
|
||||
if (!fileSystem.exists(path)) {
|
||||
throw new CollectorException("path does not exist: " + path.toString());
|
||||
}
|
||||
|
||||
return new BufferedReader(
|
||||
new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.defaultCharset()))
|
||||
.lines()
|
||||
.map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s));
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -13,9 +13,11 @@ import com.google.common.base.Splitter;
|
|||
import com.google.common.collect.Iterators;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
||||
|
||||
public class OaiCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
|
@ -26,8 +28,15 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
|||
|
||||
private OaiIteratorFactory oaiIteratorFactory;
|
||||
|
||||
private HttpClientParams clientParams;
|
||||
|
||||
public OaiCollectorPlugin(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
|
||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
|
||||
throws CollectorException {
|
||||
final String baseUrl = api.getBaseUrl();
|
||||
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
||||
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
||||
|
@ -46,26 +55,26 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
|||
}
|
||||
|
||||
if (baseUrl == null || baseUrl.isEmpty()) {
|
||||
throw new DnetCollectorException("Param 'baseurl' is null or empty");
|
||||
throw new CollectorException("Param 'baseurl' is null or empty");
|
||||
}
|
||||
|
||||
if (mdFormat == null || mdFormat.isEmpty()) {
|
||||
throw new DnetCollectorException("Param 'mdFormat' is null or empty");
|
||||
throw new CollectorException("Param 'mdFormat' is null or empty");
|
||||
}
|
||||
|
||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
||||
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
||||
}
|
||||
|
||||
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
|
||||
throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
|
||||
}
|
||||
|
||||
final Iterator<Iterator<String>> iters = sets
|
||||
.stream()
|
||||
.map(
|
||||
set -> getOaiIteratorFactory()
|
||||
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
|
||||
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report))
|
||||
.iterator();
|
||||
|
||||
return StreamSupport
|
||||
|
@ -79,4 +88,12 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
|||
}
|
||||
return oaiIteratorFactory;
|
||||
}
|
||||
|
||||
public HttpClientParams getClientParams() {
|
||||
return clientParams;
|
||||
}
|
||||
|
||||
public void setClientParams(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.oai;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLEncoder;
|
||||
import java.util.Iterator;
|
||||
|
@ -9,24 +11,28 @@ import java.util.Queue;
|
|||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.OutputFormat;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.dom4j.io.XMLWriter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
||||
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
||||
import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||
import eu.dnetlib.dhp.collection.XmlCleaner;
|
||||
|
||||
public class OaiIterator implements Iterator<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on
|
||||
// 11/24/08 5:02 PM
|
||||
private static final Logger log = LoggerFactory.getLogger(OaiIterator.class);
|
||||
|
||||
private final static String REPORT_PREFIX = "oai:";
|
||||
|
||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||
private final SAXReader reader = new SAXReader();
|
||||
|
||||
private final String baseUrl;
|
||||
private final String set;
|
||||
|
@ -35,7 +41,8 @@ public class OaiIterator implements Iterator<String> {
|
|||
private final String untilDate;
|
||||
private String token;
|
||||
private boolean started;
|
||||
private final HttpConnector httpConnector;
|
||||
private final HttpConnector2 httpConnector;
|
||||
private final AggregatorReport report;
|
||||
|
||||
public OaiIterator(
|
||||
final String baseUrl,
|
||||
|
@ -43,7 +50,8 @@ public class OaiIterator implements Iterator<String> {
|
|||
final String set,
|
||||
final String fromDate,
|
||||
final String untilDate,
|
||||
final HttpConnector httpConnector) {
|
||||
final HttpConnector2 httpConnector,
|
||||
final AggregatorReport report) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.mdFormat = mdFormat;
|
||||
this.set = set;
|
||||
|
@ -51,6 +59,7 @@ public class OaiIterator implements Iterator<String> {
|
|||
this.untilDate = untilDate;
|
||||
this.started = false;
|
||||
this.httpConnector = httpConnector;
|
||||
this.report = report;
|
||||
}
|
||||
|
||||
private void verifyStarted() {
|
||||
|
@ -58,7 +67,7 @@ public class OaiIterator implements Iterator<String> {
|
|||
this.started = true;
|
||||
try {
|
||||
this.token = firstPage();
|
||||
} catch (final DnetCollectorException e) {
|
||||
} catch (final CollectorException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
@ -80,7 +89,7 @@ public class OaiIterator implements Iterator<String> {
|
|||
while (queue.isEmpty() && token != null && !token.isEmpty()) {
|
||||
try {
|
||||
token = otherPages(token);
|
||||
} catch (final DnetCollectorException e) {
|
||||
} catch (final CollectorException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
@ -92,7 +101,7 @@ public class OaiIterator implements Iterator<String> {
|
|||
public void remove() {
|
||||
}
|
||||
|
||||
private String firstPage() throws DnetCollectorException {
|
||||
private String firstPage() throws CollectorException {
|
||||
try {
|
||||
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
|
||||
if (set != null && !set.isEmpty()) {
|
||||
|
@ -108,7 +117,8 @@ public class OaiIterator implements Iterator<String> {
|
|||
|
||||
return downloadPage(url);
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -126,32 +136,35 @@ public class OaiIterator implements Iterator<String> {
|
|||
return result.trim();
|
||||
}
|
||||
|
||||
private String otherPages(final String resumptionToken) throws DnetCollectorException {
|
||||
private String otherPages(final String resumptionToken) throws CollectorException {
|
||||
try {
|
||||
return downloadPage(
|
||||
baseUrl
|
||||
+ "?verb=ListRecords&resumptionToken="
|
||||
+ URLEncoder.encode(resumptionToken, "UTF-8"));
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private String downloadPage(final String url) throws DnetCollectorException {
|
||||
private String downloadPage(final String url) throws CollectorException {
|
||||
|
||||
final String xml = httpConnector.getInputSource(url);
|
||||
final String xml = httpConnector.getInputSource(url, report);
|
||||
Document doc;
|
||||
try {
|
||||
doc = reader.read(new StringReader(xml));
|
||||
doc = DocumentHelper.parseText(xml);
|
||||
} catch (final DocumentException e) {
|
||||
log.warn("Error parsing xml, I try to clean it: " + xml, e);
|
||||
log.warn("Error parsing xml, I try to clean it. {}", e.getMessage());
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
final String cleaned = XmlCleaner.cleanAllEntities(xml);
|
||||
try {
|
||||
doc = reader.read(new StringReader(cleaned));
|
||||
doc = DocumentHelper.parseText(xml);
|
||||
} catch (final DocumentException e1) {
|
||||
final String resumptionToken = extractResumptionToken(xml);
|
||||
if (resumptionToken == null) {
|
||||
throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1);
|
||||
report.put(e1.getClass().getName(), e1.getMessage());
|
||||
throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
|
||||
}
|
||||
return resumptionToken;
|
||||
}
|
||||
|
@ -159,19 +172,35 @@ public class OaiIterator implements Iterator<String> {
|
|||
|
||||
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
|
||||
if (errorNode != null) {
|
||||
final String code = errorNode.valueOf("@code");
|
||||
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
|
||||
log.warn("noRecordsMatch for oai call: " + url);
|
||||
final String code = errorNode.valueOf("@code").trim();
|
||||
if ("noRecordsMatch".equalsIgnoreCase(code)) {
|
||||
final String msg = "noRecordsMatch for oai call : " + url;
|
||||
log.warn(msg);
|
||||
report.put(REPORT_PREFIX + code, msg);
|
||||
return null;
|
||||
} else {
|
||||
throw new DnetCollectorException(code + " - " + errorNode.getText());
|
||||
final String msg = code + " - " + errorNode.getText();
|
||||
report.put(REPORT_PREFIX + "error", msg);
|
||||
throw new CollectorException(msg);
|
||||
}
|
||||
}
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
|
||||
queue.add(((Node) o).asXML());
|
||||
final StringWriter sw = new StringWriter();
|
||||
final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint());
|
||||
try {
|
||||
writer.write((Node) o);
|
||||
queue.add(sw.toString());
|
||||
} catch (IOException e) {
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e);
|
||||
}
|
||||
}
|
||||
|
||||
return doc.valueOf("//*[local-name()='resumptionToken']");
|
||||
}
|
||||
|
||||
public AggregatorReport getReport() {
|
||||
return report;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,24 +3,28 @@ package eu.dnetlib.dhp.collection.plugin.oai;
|
|||
|
||||
import java.util.Iterator;
|
||||
|
||||
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||
|
||||
public class OaiIteratorFactory {
|
||||
|
||||
private HttpConnector httpConnector;
|
||||
private HttpConnector2 httpConnector;
|
||||
|
||||
public Iterator<String> newIterator(
|
||||
final String baseUrl,
|
||||
final String mdFormat,
|
||||
final String set,
|
||||
final String fromDate,
|
||||
final String untilDate) {
|
||||
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector());
|
||||
final String untilDate,
|
||||
final HttpClientParams clientParams,
|
||||
final AggregatorReport report) {
|
||||
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report);
|
||||
}
|
||||
|
||||
private HttpConnector getHttpConnector() {
|
||||
private HttpConnector2 getHttpConnector(HttpClientParams clientParams) {
|
||||
if (httpConnector == null)
|
||||
httpConnector = new HttpConnector();
|
||||
httpConnector = new HttpConnector2(clientParams);
|
||||
return httpConnector;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
|
||||
/**
|
||||
* TODO: delegate HTTP requests to the common HttpConnector2 implementation.
|
||||
*
|
||||
* @author js, Andreas Czerniak
|
||||
* @date 2020-04-09
|
||||
*
|
||||
*/
|
||||
public class RestCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
public static final String RESULT_SIZE_VALUE_DEFAULT = "100";
|
||||
|
||||
private final HttpClientParams clientParams;
|
||||
|
||||
public RestCollectorPlugin(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
||||
final String baseUrl = api.getBaseUrl();
|
||||
|
||||
final String resumptionType = api.getParams().get("resumptionType");
|
||||
final String resumptionParam = api.getParams().get("resumptionParam");
|
||||
final String resumptionXpath = api.getParams().get("resumptionXpath");
|
||||
final String resultTotalXpath = api.getParams().get("resultTotalXpath");
|
||||
final String resultFormatParam = api.getParams().get("resultFormatParam");
|
||||
final String resultFormatValue = api.getParams().get("resultFormatValue");
|
||||
final String resultSizeParam = api.getParams().get("resultSizeParam");
|
||||
final String queryParams = api.getParams().get("queryParams");
|
||||
final String entityXpath = api.getParams().get("entityXpath");
|
||||
final String authMethod = api.getParams().get("authMethod");
|
||||
final String authToken = api.getParams().get("authToken");
|
||||
final String resultSizeValue = Optional
|
||||
.ofNullable(api.getParams().get("resultSizeValue"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.orElse(RESULT_SIZE_VALUE_DEFAULT);
|
||||
|
||||
if (StringUtils.isBlank(baseUrl)) {
|
||||
throw new CollectorException("Param 'baseUrl' is null or empty");
|
||||
}
|
||||
if (StringUtils.isBlank(resumptionType)) {
|
||||
throw new CollectorException("Param 'resumptionType' is null or empty");
|
||||
}
|
||||
if (StringUtils.isBlank(resumptionParam)) {
|
||||
throw new CollectorException("Param 'resumptionParam' is null or empty");
|
||||
}
|
||||
if (StringUtils.isBlank(resultFormatValue)) {
|
||||
throw new CollectorException("Param 'resultFormatValue' is null or empty");
|
||||
}
|
||||
if (StringUtils.isBlank(queryParams)) {
|
||||
throw new CollectorException("Param 'queryParams' is null or empty");
|
||||
}
|
||||
if (StringUtils.isBlank(entityXpath)) {
|
||||
throw new CollectorException("Param 'entityXpath' is null or empty");
|
||||
}
|
||||
|
||||
final String resultOutputFormat = Optional
|
||||
.ofNullable(api.getParams().get("resultOutputFormat"))
|
||||
.map(String::toLowerCase)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.orElse(resultFormatValue.toLowerCase());
|
||||
|
||||
RestIterator it = new RestIterator(
|
||||
getClientParams(),
|
||||
baseUrl,
|
||||
resumptionType,
|
||||
resumptionParam,
|
||||
resumptionXpath,
|
||||
resultTotalXpath,
|
||||
resultFormatParam,
|
||||
resultFormatValue,
|
||||
resultSizeParam,
|
||||
resultSizeValue,
|
||||
queryParams,
|
||||
entityXpath,
|
||||
authMethod,
|
||||
authToken,
|
||||
resultOutputFormat);
|
||||
|
||||
return StreamSupport
|
||||
.stream(
|
||||
Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
||||
}
|
||||
|
||||
public HttpClientParams getClientParams() {
|
||||
return clientParams;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,411 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Iterator;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.xpath.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.entity.ContentType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import eu.dnetlib.dhp.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.collection.JsonUtils;
|
||||
|
||||
/**
|
||||
* log.info(...) equal to log.trace(...) in the application-logs
|
||||
* <p>
|
||||
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
||||
*
|
||||
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
|
||||
* @date 2020-04-09
|
||||
*
|
||||
*/
|
||||
public class RestIterator implements Iterator<String> {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
||||
public static final String UTF_8 = "UTF-8";
|
||||
|
||||
private final HttpClientParams clientParams;
|
||||
|
||||
private final String BASIC = "basic";
|
||||
|
||||
private final JsonUtils jsonUtils;
|
||||
|
||||
private final String baseUrl;
|
||||
private final String resumptionType;
|
||||
private final String resumptionParam;
|
||||
private final String resultFormatValue;
|
||||
private String queryParams;
|
||||
private final int resultSizeValue;
|
||||
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||
private int resultTotal = -1;
|
||||
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
|
||||
// or token scanned from results)
|
||||
private InputStream resultStream;
|
||||
private Transformer transformer;
|
||||
private XPath xpath;
|
||||
private String query;
|
||||
private XPathExpression xprResultTotalPath;
|
||||
private XPathExpression xprResumptionPath;
|
||||
private XPathExpression xprEntity;
|
||||
private final String queryFormat;
|
||||
private final String querySize;
|
||||
private final String authMethod;
|
||||
private final String authToken;
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
||||
private int discoverResultSize = 0;
|
||||
private int pagination = 1;
|
||||
/*
|
||||
* While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in
|
||||
* json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
|
||||
* json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
|
||||
*/
|
||||
private final String resultOutputFormat;
|
||||
|
||||
/** RestIterator class
|
||||
* compatible to version 1.3.33
|
||||
*/
|
||||
public RestIterator(
|
||||
final HttpClientParams clientParams,
|
||||
final String baseUrl,
|
||||
final String resumptionType,
|
||||
final String resumptionParam,
|
||||
final String resumptionXpath,
|
||||
final String resultTotalXpath,
|
||||
final String resultFormatParam,
|
||||
final String resultFormatValue,
|
||||
final String resultSizeParam,
|
||||
final String resultSizeValueStr,
|
||||
final String queryParams,
|
||||
final String entityXpath,
|
||||
final String authMethod,
|
||||
final String authToken,
|
||||
final String resultOutputFormat) {
|
||||
|
||||
this.clientParams = clientParams;
|
||||
this.jsonUtils = new JsonUtils();
|
||||
this.baseUrl = baseUrl;
|
||||
this.resumptionType = resumptionType;
|
||||
this.resumptionParam = resumptionParam;
|
||||
this.resultFormatValue = resultFormatValue;
|
||||
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
||||
this.queryParams = queryParams;
|
||||
this.authMethod = authMethod;
|
||||
this.authToken = authToken;
|
||||
this.resultOutputFormat = resultOutputFormat;
|
||||
|
||||
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
: "";
|
||||
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
||||
|
||||
try {
|
||||
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||
} catch (Exception e) {
|
||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||
}
|
||||
initQueue();
|
||||
}
|
||||
|
||||
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
|
||||
throws TransformerConfigurationException, XPathExpressionException {
|
||||
transformer = TransformerFactory.newInstance().newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
xpath = XPathFactory.newInstance().newXPath();
|
||||
xprResultTotalPath = xpath.compile(resultTotalXpath);
|
||||
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
xprEntity = xpath.compile(entityXpath);
|
||||
}
|
||||
|
||||
private void initQueue() {
|
||||
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
||||
log.info("REST calls starting with " + query);
|
||||
}
|
||||
|
||||
private void disconnect() {
|
||||
// TODO close inputstream
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see java.util.Iterator#hasNext()
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (recordQueue.isEmpty() && query.isEmpty()) {
|
||||
disconnect();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see java.util.Iterator#next()
|
||||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (recordQueue) {
|
||||
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
||||
try {
|
||||
query = downloadPage(query);
|
||||
} catch (CollectorException e) {
|
||||
log.debug("CollectorPlugin.next()-Exception: " + e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return recordQueue.poll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* download page and return nextQuery
|
||||
*/
|
||||
private String downloadPage(String query) throws CollectorException {
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String nextQuery = "";
|
||||
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
int urlOldResumptionSize = 0;
|
||||
InputStream theHttpInputStream;
|
||||
|
||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
|
||||
log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
|
||||
if (!query.contains("&cursor=")) {
|
||||
query += "&cursor=*";
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("requestig URL [{}]", query);
|
||||
|
||||
URL qUrl = new URL(query);
|
||||
log.debug("authMethod :" + authMethod);
|
||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: " + resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: " + resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else {
|
||||
theHttpInputStream = qUrl.openStream();
|
||||
}
|
||||
|
||||
resultStream = theHttpInputStream;
|
||||
if ("json".equals(resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
||||
resultXml = jsonUtils.convertToXML(resultJson);
|
||||
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
}
|
||||
|
||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: " + nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
StringWriter sw = new StringWriter();
|
||||
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
String toEnqueue = sw.toString();
|
||||
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||
log.warn("The following record resulted in empty item for the feeding queue: " + resultXml);
|
||||
} else {
|
||||
recordQueue.add(sw.toString());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.warn("resultXml is equal with emptyXml");
|
||||
}
|
||||
|
||||
resumptionInt += resultSizeValue;
|
||||
|
||||
switch (resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (resultSizeValue < 2) {
|
||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||
}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
||||
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
if (isInteger(resumptionKeyValue[1])) {
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
|
||||
} else {
|
||||
log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
resultTotal = discoverResultSize;
|
||||
} else {
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
resultTotal = resumptionInt + 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
}
|
||||
log.info("discoverResultSize: {}", discoverResultSize);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over page numbers
|
||||
pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
resultTotal = discoverResultSize;
|
||||
pagination = discoverResultSize;
|
||||
}
|
||||
resumptionInt = pagination;
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
|
||||
// solr)
|
||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||
|
||||
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
|
||||
queryParams = queryParams.replace("&cursor=*", "");
|
||||
|
||||
// terminating if length of nodeList is 0
|
||||
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
|
||||
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
|
||||
} else {
|
||||
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
|
||||
// because the iteration is over
|
||||
// real length and the
|
||||
// resultSizeValue is added before
|
||||
// the switch()
|
||||
}
|
||||
|
||||
discoverResultSize = nodeList.getLength();
|
||||
|
||||
log
|
||||
.debug(
|
||||
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
|
||||
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
|
||||
|
||||
break;
|
||||
|
||||
default: // otherwise: abort
|
||||
// resultTotal = resumptionInt;
|
||||
break;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultTotal == -1) {
|
||||
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
||||
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
|
||||
resultTotal += 1;
|
||||
} // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + resultTotal);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||
}
|
||||
log.debug("resultTotal: " + resultTotal);
|
||||
log.debug("resInt: " + resumptionInt);
|
||||
if (resumptionInt <= resultTotal) {
|
||||
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
|
||||
+ queryFormat;
|
||||
} else {
|
||||
nextQuery = "";
|
||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||
}
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
|
||||
}
|
||||
|
||||
private boolean isInteger(String s) {
|
||||
boolean isValidInteger = false;
|
||||
try {
|
||||
Integer.parseInt(s);
|
||||
|
||||
// s is a valid integer
|
||||
|
||||
isValidInteger = true;
|
||||
} catch (NumberFormatException ex) {
|
||||
// s is not an integer
|
||||
}
|
||||
|
||||
return isValidInteger;
|
||||
}
|
||||
|
||||
// Method to encode a string value using `UTF-8` encoding scheme
|
||||
private String encodeValue(String value) {
|
||||
try {
|
||||
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
public String getResultFormatValue() {
|
||||
return resultFormatValue;
|
||||
}
|
||||
|
||||
public String getResultOutputFormat() {
|
||||
return resultOutputFormat;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,139 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
||||
import eu.dnetlib.message.Message;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
import eu.dnetlib.message.MessageType;
|
||||
|
||||
public class DnetCollectorWorker {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);
|
||||
|
||||
private final CollectorPluginFactory collectorPluginFactory;
|
||||
|
||||
private final ArgumentApplicationParser argumentParser;
|
||||
|
||||
private final MessageManager manager;
|
||||
|
||||
public DnetCollectorWorker(
|
||||
final CollectorPluginFactory collectorPluginFactory,
|
||||
final ArgumentApplicationParser argumentParser,
|
||||
final MessageManager manager)
|
||||
throws DnetCollectorException {
|
||||
this.collectorPluginFactory = collectorPluginFactory;
|
||||
this.argumentParser = argumentParser;
|
||||
this.manager = manager;
|
||||
}
|
||||
|
||||
public void collect() throws DnetCollectorException {
|
||||
try {
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class);
|
||||
|
||||
final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());
|
||||
|
||||
final String hdfsuri = argumentParser.get("namenode");
|
||||
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS"));
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
// Get the filesystem - HDFS
|
||||
FileSystem.get(URI.create(hdfsuri), conf);
|
||||
Path hdfswritepath = new Path(argumentParser.get("hdfsPath"));
|
||||
|
||||
log.info("Created path " + hdfswritepath.toString());
|
||||
|
||||
final Map<String, String> ongoingMap = new HashMap<>();
|
||||
final Map<String, String> reportMap = new HashMap<>();
|
||||
final AtomicInteger counter = new AtomicInteger(0);
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfswritepath),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
final IntWritable key = new IntWritable(counter.get());
|
||||
final Text value = new Text();
|
||||
plugin
|
||||
.collect(api)
|
||||
.forEach(
|
||||
content -> {
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
if (counter.get() % 10 == 0) {
|
||||
try {
|
||||
ongoingMap.put("ongoing", "" + counter.get());
|
||||
log
|
||||
.debug(
|
||||
"Sending message: "
|
||||
+ manager
|
||||
.sendMessage(
|
||||
new Message(
|
||||
argumentParser.get("workflowId"),
|
||||
"Collection",
|
||||
MessageType.ONGOING,
|
||||
ongoingMap),
|
||||
argumentParser.get("rabbitOngoingQueue"),
|
||||
true,
|
||||
false));
|
||||
} catch (Exception e) {
|
||||
log.error("Error on sending message ", e);
|
||||
}
|
||||
}
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
ongoingMap.put("ongoing", "" + counter.get());
|
||||
manager
|
||||
.sendMessage(
|
||||
new Message(
|
||||
argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap),
|
||||
argumentParser.get("rabbitOngoingQueue"),
|
||||
true,
|
||||
false);
|
||||
reportMap.put("collected", "" + counter.get());
|
||||
manager
|
||||
.sendMessage(
|
||||
new Message(
|
||||
argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
|
||||
argumentParser.get("rabbitOngoingQueue"),
|
||||
true,
|
||||
false);
|
||||
manager.close();
|
||||
} catch (Throwable e) {
|
||||
throw new DnetCollectorException("Error on collecting ", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
|
||||
/**
|
||||
* DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
|
||||
* will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
|
||||
* plugin to use and where store the data into HDFS path
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
public class DnetCollectorWorkerApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
|
||||
|
||||
private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
|
||||
|
||||
private static ArgumentApplicationParser argumentParser;
|
||||
|
||||
/** @param args */
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
argumentParser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
DnetCollectorWorker.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/collector/worker/collector_parameter.json")));
|
||||
argumentParser.parseArgument(args);
|
||||
log.info("hdfsPath =" + argumentParser.get("hdfsPath"));
|
||||
log.info("json = " + argumentParser.get("apidescriptor"));
|
||||
final MessageManager manager = new MessageManager(
|
||||
argumentParser.get("rabbitHost"),
|
||||
argumentParser.get("rabbitUser"),
|
||||
argumentParser.get("rabbitPassword"),
|
||||
false,
|
||||
false,
|
||||
null);
|
||||
final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager);
|
||||
worker.collect();
|
||||
}
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker.utils;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
 * List of the error messages collected across retries, rendered as a single line by {@link #toString()}.
 */
public class CollectorPluginErrorLogList extends LinkedList<String> {

    private static final long serialVersionUID = -6925786561303289704L;

    /**
     * Renders every message as {@code "Retry #<index>: <message> / "}, in list order and with a 0-based index.
     *
     * @return the concatenation of all the entries, or the empty string when the list is empty
     */
    @Override
    public String toString() {
        // StringBuilder instead of repeated String concatenation inside the loop
        final StringBuilder log = new StringBuilder();
        int index = 0;
        for (final String errorMessage : this) {
            log.append(String.format("Retry #%s: %s / ", index++, errorMessage));
        }
        return log.toString();
    }
}
|
|
@ -1,20 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker.utils;
|
||||
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
||||
|
||||
public class CollectorPluginFactory {
|
||||
|
||||
public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException {
|
||||
if (protocol == null)
|
||||
throw new DnetCollectorException("protocol cannot be null");
|
||||
switch (protocol.toLowerCase().trim()) {
|
||||
case "oai":
|
||||
return new OaiCollectorPlugin();
|
||||
default:
|
||||
throw new DnetCollectorException("UNknown protocol");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,244 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.worker.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.*;
|
||||
import java.security.GeneralSecurityException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.net.ssl.HttpsURLConnection;
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.math.NumberUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
||||
|
||||
public class HttpConnector {
|
||||
|
||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
||||
|
||||
private int maxNumberOfRetry = 6;
|
||||
private int defaultDelay = 120; // seconds
|
||||
private int readTimeOut = 120; // seconds
|
||||
|
||||
private String responseType = null;
|
||||
|
||||
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||
|
||||
public HttpConnector() {
|
||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @return the content of the downloaded resource
|
||||
* @throws DnetCollectorException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public String getInputSource(final String requestUrl) throws DnetCollectorException {
|
||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content as a stream via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @return the content of the downloaded resource as InputStream
|
||||
* @throws DnetCollectorException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
|
||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
private String attemptDownlaodAsString(
|
||||
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||
throws DnetCollectorException {
|
||||
try {
|
||||
final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
try {
|
||||
return IOUtils.toString(s);
|
||||
} catch (final IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(s);
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private InputStream attemptDownload(
|
||||
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||
throws DnetCollectorException {
|
||||
|
||||
if (retryNumber > maxNumberOfRetry) {
|
||||
throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList);
|
||||
}
|
||||
|
||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
||||
try {
|
||||
InputStream input = null;
|
||||
|
||||
try {
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
urlConn.setReadTimeout(readTimeOut * 1000);
|
||||
urlConn.addRequestProperty("User-Agent", userAgent);
|
||||
|
||||
if (log.isDebugEnabled()) {
|
||||
logHeaderFields(urlConn);
|
||||
}
|
||||
|
||||
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
||||
Thread.sleep(retryAfter * 1000);
|
||||
errorList.add("503 Service Unavailable");
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
|
||||
|| urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
|
||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||
log.debug("The requested url has been moved to " + newUrl);
|
||||
errorList
|
||||
.add(
|
||||
String
|
||||
.format(
|
||||
"%s %s. Moved to: %s",
|
||||
urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
||||
log
|
||||
.error(
|
||||
String
|
||||
.format(
|
||||
"HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList
|
||||
.add(
|
||||
String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else {
|
||||
input = urlConn.getInputStream();
|
||||
responseType = urlConn.getContentType();
|
||||
return input;
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
||||
|
||||
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||
if (e.getKey() != null) {
|
||||
for (final String v : e.getValue()) {
|
||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||
for (final String key : headerMap.keySet()) {
|
||||
if (key != null
|
||||
&& key.toLowerCase().equals("retry-after")
|
||||
&& headerMap.get(key).size() > 0
|
||||
&& NumberUtils.isNumber(headerMap.get(key).get(0))) {
|
||||
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private String obtainNewLocation(final Map<String, List<String>> headerMap)
|
||||
throws DnetCollectorException {
|
||||
for (final String key : headerMap.keySet()) {
|
||||
if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) {
|
||||
return headerMap.get(key).get(0);
|
||||
}
|
||||
}
|
||||
throw new DnetCollectorException(
|
||||
"The requested url has been MOVED, but 'location' param is MISSING");
|
||||
}
|
||||
|
||||
/**
|
||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
||||
*/
|
||||
public void initTrustManager() {
|
||||
final X509TrustManager tm = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
try {
|
||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
||||
ctx.init(null, new TrustManager[] {
|
||||
tm
|
||||
}, null);
|
||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
||||
} catch (final GeneralSecurityException e) {
|
||||
log.fatal(e);
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public int getMaxNumberOfRetry() {
|
||||
return maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
||||
this.maxNumberOfRetry = maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public int getDefaultDelay() {
|
||||
return defaultDelay;
|
||||
}
|
||||
|
||||
public void setDefaultDelay(final int defaultDelay) {
|
||||
this.defaultDelay = defaultDelay;
|
||||
}
|
||||
|
||||
public int getReadTimeOut() {
|
||||
return readTimeOut;
|
||||
}
|
||||
|
||||
public void setReadTimeOut(final int readTimeOut) {
|
||||
this.readTimeOut = readTimeOut;
|
||||
}
|
||||
|
||||
public String getResponseType() {
|
||||
return responseType;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.transformation;
|
||||
|
||||
/**
 * Checked exception for failures in the transformation package.
 * Every constructor simply delegates to the matching {@link Exception} constructor.
 */
public class DnetTransformationException extends Exception {

	/** Creates an exception with neither a message nor a cause. */
	public DnetTransformationException() {
		super();
	}

	/**
	 * Creates an exception carrying only a message.
	 *
	 * @param message the detail message
	 */
	public DnetTransformationException(final String message) {
		super(message);
	}

	/**
	 * Creates an exception that wraps another throwable.
	 *
	 * @param cause the underlying cause
	 */
	public DnetTransformationException(final Throwable cause) {
		super(cause);
	}

	/**
	 * Creates an exception with a message and an underlying cause.
	 *
	 * @param message the detail message
	 * @param cause the underlying cause
	 */
	public DnetTransformationException(final String message, final Throwable cause) {
		super(message, cause);
	}

	/**
	 * Creates an exception with full control over suppression and stack trace capture.
	 *
	 * @param message the detail message
	 * @param cause the underlying cause
	 * @param enableSuppression whether suppressed exceptions are recorded
	 * @param writableStackTrace whether the stack trace is captured
	 */
	public DnetTransformationException(
		final String message,
		final Throwable cause,
		final boolean enableSuppression,
		final boolean writableStackTrace) {
		super(message, cause, enableSuppression, writableStackTrace);
	}
}
|
|
@ -1,74 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.transformation;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.transformation.functions.Cleaner;
|
||||
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
|
||||
import net.sf.saxon.s9api.*;
|
||||
|
||||
public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {
|
||||
|
||||
private final LongAccumulator totalItems;
|
||||
private final LongAccumulator errorItems;
|
||||
private final LongAccumulator transformedItems;
|
||||
private final String transformationRule;
|
||||
private final Cleaner cleanFunction;
|
||||
|
||||
private final long dateOfTransformation;
|
||||
|
||||
public TransformFunction(
|
||||
LongAccumulator totalItems,
|
||||
LongAccumulator errorItems,
|
||||
LongAccumulator transformedItems,
|
||||
final String transformationRule,
|
||||
long dateOfTransformation,
|
||||
final Map<String, Vocabulary> vocabularies)
|
||||
throws Exception {
|
||||
this.totalItems = totalItems;
|
||||
this.errorItems = errorItems;
|
||||
this.transformedItems = transformedItems;
|
||||
this.transformationRule = transformationRule;
|
||||
this.dateOfTransformation = dateOfTransformation;
|
||||
cleanFunction = new Cleaner(vocabularies);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MetadataRecord call(MetadataRecord value) {
|
||||
totalItems.add(1);
|
||||
try {
|
||||
Processor processor = new Processor(false);
|
||||
processor.registerExtensionFunction(cleanFunction);
|
||||
final XsltCompiler comp = processor.newXsltCompiler();
|
||||
XsltExecutable xslt = comp
|
||||
.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
|
||||
XdmNode source = processor
|
||||
.newDocumentBuilder()
|
||||
.build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
|
||||
XsltTransformer trans = xslt.load();
|
||||
trans.setInitialContextNode(source);
|
||||
final StringWriter output = new StringWriter();
|
||||
Serializer out = processor.newSerializer(output);
|
||||
out.setOutputProperty(Serializer.Property.METHOD, "xml");
|
||||
out.setOutputProperty(Serializer.Property.INDENT, "yes");
|
||||
trans.setDestination(out);
|
||||
trans.transform();
|
||||
final String xml = output.toString();
|
||||
value.setBody(xml);
|
||||
value.setDateOfTransformation(dateOfTransformation);
|
||||
transformedItems.add(1);
|
||||
return value;
|
||||
} catch (Throwable e) {
|
||||
errorItems.add(1);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue