implemented import crossref job

2020-04-01 14:12:33 +02:00 · 2020-04-01 14:12:33 +02:00 · 205e9521c6
parent 36236dd1c1
commit 205e9521c6
14 changed files with 912 additions and 372 deletions
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
@ -75,10 +75,10 @@ public class SparkCreateDedupTest {
        final HashFunction hashFunction = Hashing.murmur3_128();
-        System.out.println( s1.hashCode());
+//        System.out.println( s1.hashCode());
-        System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
+//        System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
-        System.out.println( s2.hashCode());
+//        System.out.println( s2.hashCode());
-        System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
+//        System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
    }
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@ -0,0 +1,53 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.1.6-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-doiboost</artifactId>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.3.4</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
           <exclusions>
               <exclusion>
                   <groupId>org.apache.cxf</groupId>
                   <artifactId>cxf-rt-transports-http</artifactId>
               </exclusion>
           </exclusions>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
        </dependency>
    </dependencies>
 </project>
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/CrossrefImporter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/CrossrefImporter.java
@ -0,0 +1,65 @@
 package eu.dnetlib.doiboost;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.http.HttpHost;
 /**
  * Command-line job that copies every record served by {@link ESClient} for the Crossref
  * Elasticsearch index into a Hadoop {@link SequenceFile}.
  *
  * <p>Each record is appended as a ({@link IntWritable} progressive counter, {@link Text} record)
  * pair. Arguments are described by {@code import_from_es.json}: {@code -t/--targetPath} is the
  * sequence file to write, {@code -n/--namenode} is the value used for {@code fs.defaultFS}.
  */
 public class CrossrefImporter {

    /** Elasticsearch host the records are read from. */
    private static final String ES_HOST = "ip-90-147-167-25.ct1.garrservices.it";

    /** Name of the Elasticsearch index holding the Crossref records. */
    private static final String ES_INDEX = "crossref";

    /** Number of records between two progress messages. */
    private static final int PROGRESS_INTERVAL = 100000;

    /**
     * Parses the arguments, configures the target file system and streams the whole index into the
     * target sequence file, printing a progress line every {@link #PROGRESS_INTERVAL} records.
     *
     * @param args command-line arguments ({@code -t <targetPath> -n <namenode>})
     * @throws Exception if argument parsing, the index scan or the write fails
     */
    public static void main(String[] args) throws Exception {
        // The argument specification is a JSON resource: decode it explicitly as UTF-8 instead of
        // relying on the platform default charset.
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
                IOUtils.toString(CrossrefImporter.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/import_from_es.json"), "UTF-8"));
        parser.parseArgument(args);

        final String targetPath = parser.get("targetPath");
        System.out.println(targetPath);
        final String hdfsuri = parser.get("namenode");
        System.out.println(hdfsuri);
        final Path hdfswritepath = new Path(targetPath);

        // ====== Init HDFS File System Object
        final Configuration conf = new Configuration();
        // Set FileSystem URI
        conf.set("fs.defaultFS", hdfsuri);
        // Because of Maven
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

        final ESClient client = new ESClient(ES_HOST, ES_INDEX);

        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class))) {

            int i = 0;
            long start = System.currentTimeMillis();
            // The same key/value holders are reused for every record to avoid per-record allocation.
            final IntWritable key = new IntWritable(i);
            final Text value = new Text();
            while (client.hasNext()) {
                key.set(i++);
                value.set(client.next());
                writer.append(key, value);
                if (i % PROGRESS_INTERVAL == 0) {
                    final long end = System.currentTimeMillis();
                    // Divide by a float: a long division would drop the fractional seconds before
                    // the value is ever formatted with %f.
                    final float time = (end - start) / 1000f;
                    System.out.println(String.format("Imported %d records last %d imported in %f seconds", i, PROGRESS_INTERVAL, time));
                    start = end;
                }
            }
        }
    }
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/ESClient.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/ESClient.java
@ -0,0 +1,103 @@
 package eu.dnetlib.doiboost;
 import com.jayway.jsonpath.JsonPath;
 import org.apache.commons.io.IOUtils;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.entity.StringEntity;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;
 /**
  * {@link Iterator} over the {@code _source.blob} value of every document returned by a scroll
  * search on an Elasticsearch index.
  *
  * <p>The constructor opens the scroll and loads the first page of 1000 hits; {@link #next()}
  * loads the following page as soon as the current one has been consumed. Every HTTP call is a
  * POST issued through a short-lived client. Failures are reported as {@link RuntimeException}.
  */
 public class ESClient implements Iterator<String> {

    /** JSONPath selecting the blob of every hit of a search/scroll response. */
    final static String blobPath = "$.hits.hits[*]._source.blob";

    /** JSONPath selecting the scroll identifier of a search/scroll response. */
    final static String scrollIdPath = "$._scroll_id";

    /** Charset used for both the JSON request body and the decoding of the response. */
    private static final String UTF_8 = "UTF-8";

    /** Scroll identifier to send with the next scroll request. */
    String scrollId;

    /** Blobs of the current page that have not been returned yet. */
    List<String> buffer;

    final String esHost;

    final String esIndex;

    /**
     * Opens a scroll (1 minute keep-alive, page size 1000) on the given index and loads the first page.
     *
     * @param esHost  host name of the Elasticsearch node, contacted on port 9200
     * @param esIndex name of the index to scan
     * @throws IOException declared for callers; request failures surface as {@link RuntimeException}
     */
    public ESClient(final String esHost, final String esIndex) throws IOException {
        this.esHost = esHost;
        this.esIndex = esIndex;
        final String body = getResponse(String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), "{\"size\":1000}");
        scrollId = getJPathString(scrollIdPath, body);
        buffer = getBlobs(body);
    }

    /**
     * POSTs {@code json} (when not null) to {@code url} and returns the response body as text.
     *
     * @param url  the endpoint to call
     * @param json the JSON request body, or {@code null} to send no entity
     * @return the response body decoded as UTF-8
     * @throws RuntimeException if the request cannot be executed or the response cannot be read
     */
    private String getResponse(final String url, final String json) {
        // try-with-resources releases both the client and the response, even on failure.
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            final HttpPost httpPost = new HttpPost(url);
            if (json != null) {
                // Encode the body explicitly: the single-argument StringEntity constructor does not use UTF-8.
                httpPost.setEntity(new StringEntity(json, UTF_8));
                httpPost.setHeader("Accept", "application/json");
                httpPost.setHeader("Content-type", "application/json");
            }
            try (CloseableHttpResponse response = client.execute(httpPost)) {
                // Decode explicitly as UTF-8 instead of the platform default charset.
                return IOUtils.toString(response.getEntity().getContent(), UTF_8);
            }
        } catch (Exception e) {
            throw new RuntimeException("Error on executing request " + url, e);
        }
    }

    /**
     * Evaluates a JSONPath expected to select a single string.
     *
     * @param jsonPath the JSONPath expression
     * @param json     the document to evaluate it on
     * @return the selected string; {@code null} when the selected value is not a string; the empty
     *         string when the evaluation fails
     */
    private String getJPathString(final String jsonPath, final String json) {
        try {
            final Object o = JsonPath.read(json, jsonPath);
            if (o instanceof String) {
                return (String) o;
            }
            return null;
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Extracts the blobs of all the hits contained in a search/scroll response.
     *
     * @param body the JSON response body
     * @return the blobs selected by {@link #blobPath}
     */
    private List<String> getBlobs(final String body) {
        return JsonPath.read(body, blobPath);
    }

    /**
     * Requests the next page of the scroll, replaces {@link #buffer} with it and records the
     * scroll identifier carried by the response.
     *
     * @throws RuntimeException if the response does not contain the expected hits structure;
     *                          failing here prevents callers from silently ending with a partial scan
     */
    private void fetchNextPage() {
        final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId);
        final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
        try {
            buffer = getBlobs(body);
        } catch (Exception e) {
            throw new RuntimeException("Unexpected scroll response: " + body, e);
        }
        // Only the most recently received scroll id should be used for the following request.
        final String latestScrollId = getJPathString(scrollIdPath, body);
        if (latestScrollId != null && !latestScrollId.isEmpty()) {
            scrollId = latestScrollId;
        }
    }

    @Override
    public boolean hasNext() {
        return (buffer != null && !buffer.isEmpty());
    }

    /**
     * Returns the next blob, loading the following page when the current one becomes empty.
     *
     * @return the next blob
     * @throws java.util.NoSuchElementException if no element is left
     */
    @Override
    public String next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("No more records available from index " + esIndex);
        }
        final String nextItem = buffer.remove(0);
        if (buffer.isEmpty()) {
            fetchNextPage();
        }
        return nextItem;
    }
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/Journal.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/Journal.scala
@ -0,0 +1,16 @@
 package eu.dnetlib.doiboost
 /**
  * Immutable record describing a journal.
  *
  * NOTE(review): the ten fields are in the same order as columns 0-9 read by the (currently
  * commented-out) SparkDownloadContentFromCrossref job from a tab-delimited `mag_Journals`
  * file, so this presumably models one row of that file — confirm before relying on it.
  * Field meanings are not established anywhere in this module beyond their names.
  */
 case class Journal(
                    JournalId: Long,
                    Rank: Int,
                    NormalizedName: String,
                    DisplayName: String,
                    Issn: String,
                    Publisher: String,
                    Webpage: String,
                    PaperCount: Long,
                    CitationCount: Long,
                    CreatedDate: String
                  )
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkDownloadContentFromCrossref.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkDownloadContentFromCrossref.scala
@ -0,0 +1,49 @@
 package eu.dnetlib.doiboost
 //import org.apache.spark.SparkConf
 //import org.apache.spark.sql.{Dataset, Encoders, Row, SparkSession}
 //
 //object SparkDownloadContentFromCrossref {
 //
 //
 //  def main(args: Array[String]): Unit = {
 //
 //
 //    val conf: SparkConf = new SparkConf().setAppName("DownloadContentFromCrossref").setMaster("local[*]")
 //
 //    val spark = SparkSession.builder().config(conf).getOrCreate()
 //
 //
 //    val sc = spark.sparkContext
 //    import spark.implicits._
 //    spark.read.option("header", "false")
 //      .option("delimiter", "\t")
 //      .csv("/Users/sandro/Downloads/doiboost/mag_Journals.txt.gz")
 //
 //
 //    val d = spark.read.option("header", "false")
 //      .option("delimiter", "\t")
 //      .csv("/Users/sandro/Downloads/doiboost/mag_Journals.txt.gz")
 //      .map(f =>
 //        Journal( f.getAs[String](0).toLong, f.getAs[String](1).toInt, f.getAs[String](2),
 //          f.getAs[String](3), f.getAs[String](4), f.getAs[String](5), f.getAs[String](6),
 //          f.getAs[String](7).toLong, f.getAs[String](8).toLong, f.getAs[String](9)
 //        ))
 //
 //    d.show()
 //
 //    d.printSchema()
 //
 //
 //
 //
 //
 //
 //
 //
 //  }
 //
 //
 //}
 //
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/application/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/application/oozie_app/config-default.xml
@ -0,0 +1,18 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.java</name>
        <value>spark2</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/application/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/application/oozie_app/workflow.xml
@ -0,0 +1,39 @@
 <workflow-app name="import Crossref from index into HDFS" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>workingPath</name>
            <description>the working dir base path</description>
        </property>
    </parameters>
    <start to="ResetWorkingPath"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="ResetWorkingPath">
        <fs>
            <delete path='${workingPath}'/>
            <mkdir path='${workingPath}/input/crossref'/>
        </fs>
        <ok to="ImportCrossRef"/>
        <error to="Kill"/>
    </action>
    <action name="ImportCrossRef">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.doiboost.CrossrefImporter</main-class>
            <arg>-t</arg><arg>${workingPath}/input/crossref/index_dump</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
        </java>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json
@ -0,0 +1,4 @@
 [
  {"paramName":"t",   "paramLongName":"targetPath",         "paramDescription": "the path of the sequential file to write",                  "paramRequired": true},
  {"paramName":"n",   "paramLongName":"namenode",           "paramDescription": "the URI of the HDFS namenode",                             "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java
@ -0,0 +1,61 @@
 package eu.dnetlib.doiboost;
 import com.jayway.jsonpath.JsonPath;
 import org.apache.commons.io.IOUtils;
 import org.junit.Assert;
 import org.junit.Ignore;
 import org.junit.Test;
 import java.io.IOException;
 import java.util.List;
 /**
  * Checks for the Crossref import: two ignored tests that talk to the remote Elasticsearch
  * host and one test that evaluates the blob JSONPath on a stored response.
  */
 public class DoiBoostTest {

    /**
     * Runs {@link CrossrefImporter} writing to a local sequence file under {@code /tmp}.
     * Ignored by default; the importer contacts a remote Elasticsearch host.
     */
    @Test
    @Ignore
    public void test() throws Exception {
        final String[] arguments = {
                "-n", "file:///tmp",
                "-t", "file:///tmp/p.seq",
        };
        CrossrefImporter.main(arguments);
    }

    /** Evaluates the blob JSONPath on the {@code response.json} test resource and prints the hit count. */
    @Test
    public void testPath() throws Exception {
        final String responseJson = IOUtils.toString(getClass().getResourceAsStream("response.json"));
        final List<String> blobs = JsonPath.read(responseJson, "$.hits.hits[*]._source.blob");
        System.out.println(blobs.size());
    }

    /**
     * Iterates over the remote index through {@link ESClient}, asserting that every record is
     * non-null and printing the elapsed whole seconds every 1000 records. Stops once more than
     * 1000000 records have been read. Ignored by default.
     */
    @Test
    @Ignore
    public void testParseResponse() throws IOException {
        long batchStart = System.currentTimeMillis();
        final ESClient esClient = new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
        int consumed = 0;
        // The upper bound is checked first, so hasNext() is not queried again once it is exceeded.
        while (consumed <= 1000000 && esClient.hasNext()) {
            Assert.assertNotNull(esClient.next());
            consumed++;
            if (consumed % 1000 == 0) {
                final long batchEnd = System.currentTimeMillis();
                final long elapsedSeconds = (batchEnd - batchStart) / 1000;
                System.out.println("Vel 1000 records in " + elapsedSeconds + "s");
                batchStart = System.currentTimeMillis();
            }
        }
    }
 }
--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/response.json
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/response.json
--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/s.json
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/s.json
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@ -19,6 +19,7 @@
        <module>dhp-graph-mapper</module>
        <module>dhp-dedup</module>
        <module>dhp-graph-provision</module>
        <module>dhp-doiboost</module>
    </modules>
    <pluginRepositories>