forked from D-Net/dnet-hadoop
[DOIBoost] configurable ES server url and index name in crossref importer
This commit is contained in:
parent
26e9d55c13
commit
8879704ba0
|
@ -2,18 +2,16 @@
|
|||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.Optional;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
|
@ -30,34 +28,45 @@ public class CrossrefImporter {
|
|||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsuri = parser.get("namenode");
|
||||
System.out.println("HDFS URI" + hdfsuri);
|
||||
Path hdfswritepath = new Path(parser.get("targetPath"));
|
||||
System.out.println("TargetPath: " + hdfsuri);
|
||||
final String namenode = parser.get("namenode");
|
||||
System.out.println("namenode: " + namenode);
|
||||
|
||||
final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
|
||||
? Long.parseLong(parser.get("timestamp"))
|
||||
: -1;
|
||||
Path targetPath = new Path(parser.get("targetPath"));
|
||||
System.out.println("targetPath: " + targetPath);
|
||||
|
||||
if (timestamp > 0)
|
||||
System.out.println("Timestamp added " + timestamp);
|
||||
final Long timestamp = Optional
|
||||
.ofNullable(parser.get("timestamp"))
|
||||
.map(s -> {
|
||||
try {
|
||||
return Long.parseLong(s);
|
||||
} catch (NumberFormatException e) {
|
||||
return -1L;
|
||||
}
|
||||
})
|
||||
.orElse(-1L);
|
||||
System.out.println("timestamp: " + timestamp);
|
||||
|
||||
final String esServer = parser.get("esServer");
|
||||
System.out.println("esServer: " + esServer);
|
||||
|
||||
final String esIndex = parser.get("esIndex");
|
||||
System.out.println("esIndex: " + esIndex);
|
||||
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
conf.set("fs.defaultFS", namenode);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
ESClient client = timestamp > 0
|
||||
? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
|
||||
: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
|
||||
// "ip-90-147-167-25.ct1.garrservices.it", "crossref"
|
||||
final ESClient client = new ESClient(esServer, esIndex, timestamp);
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfswritepath),
|
||||
SequenceFile.Writer.file(targetPath),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
|
||||
|
@ -74,8 +83,7 @@ public class CrossrefImporter {
|
|||
end = System.currentTimeMillis();
|
||||
final float time = (end - start) / 1000.0F;
|
||||
System.out
|
||||
.println(
|
||||
String.format("Imported %d records last 100000 imported in %f seconds", i, time));
|
||||
.println(String.format("Imported %s records last 100000 imported in %s seconds", i, time));
|
||||
start = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
|
@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory;
|
|||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
public class ESClient implements Iterator<String> {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
|
||||
|
||||
static final String blobPath = "$.hits[*].hits[*]._source.blob";
|
||||
static final String scrollIdPath = "$._scroll_id";
|
||||
static final String JSON_NO_TS = "{\"size\":1000}";
|
||||
static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
||||
static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
||||
private static final String BLOB_PATH = "$.hits.hits[*]._source.blob";
|
||||
private static final String SCROLL_ID_PATH = "$._scroll_id";
|
||||
private static final String JSON_NO_TS = "{\"size\":1000}";
|
||||
private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
||||
private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
||||
|
||||
public static final String APPLICATION_JSON = "application/json";
|
||||
|
||||
public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m";
|
||||
public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll";
|
||||
|
||||
private final String scrollId;
|
||||
|
||||
|
@ -31,47 +35,30 @@ public class ESClient implements Iterator<String> {
|
|||
|
||||
private final String esHost;
|
||||
|
||||
public ESClient(final String esHost, final String esIndex) throws IOException {
|
||||
|
||||
public ESClient(final String esHost, final String esIndex, final long timestamp) {
|
||||
this.esHost = esHost;
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
public ESClient(final String esHost, final String esIndex, final long timestamp)
|
||||
throws IOException {
|
||||
this.esHost = esHost;
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
|
||||
String.format(JSON_WITH_TS, timestamp));
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
final String body = timestamp > 0
|
||||
? getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp))
|
||||
: getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS);
|
||||
scrollId = getJPathString(SCROLL_ID_PATH, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
private String getResponse(final String url, final String json) {
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
try {
|
||||
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpPost httpPost = new HttpPost(url);
|
||||
if (json != null) {
|
||||
StringEntity entity = new StringEntity(json);
|
||||
httpPost.setEntity(entity);
|
||||
httpPost.setHeader("Accept", "application/json");
|
||||
httpPost.setHeader("Content-type", "application/json");
|
||||
httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON);
|
||||
httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON);
|
||||
}
|
||||
CloseableHttpResponse response = client.execute(httpPost);
|
||||
|
||||
try (CloseableHttpResponse response = client.execute(httpPost)) {
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("Error on executing request ", e);
|
||||
} finally {
|
||||
try {
|
||||
client.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close client ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -87,7 +74,7 @@ public class ESClient implements Iterator<String> {
|
|||
}
|
||||
|
||||
private List<String> getBlobs(final String body) {
|
||||
final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
|
||||
final List<String> res = JsonPath.read(body, BLOB_PATH);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -102,11 +89,11 @@ public class ESClient implements Iterator<String> {
|
|||
if (buffer.isEmpty()) {
|
||||
|
||||
final String json_param = String.format(JSON_SCROLL, scrollId);
|
||||
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
|
||||
final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param);
|
||||
try {
|
||||
buffer = getBlobs(body);
|
||||
} catch (Throwable e) {
|
||||
logger.error("Error on get next page: body:" + body);
|
||||
System.out.println("Error on get next page: body:" + body);
|
||||
}
|
||||
}
|
||||
return nextItem;
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
|
||||
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
|
||||
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false}
|
||||
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false},
|
||||
{"paramName":"ess", "paramLongName":"esServer", "paramDescription": "elasticsearch server url", "paramRequired": true},
|
||||
{"paramName":"esi", "paramLongName":"esIndex", "paramDescription": "elasticsearch index name", "paramRequired": true}
|
||||
]
|
|
@ -36,7 +36,6 @@
|
|||
|
||||
|
||||
<!-- Crossref Parameters -->
|
||||
|
||||
<property>
|
||||
<name>inputPathCrossref</name>
|
||||
<description>the Crossref input path</description>
|
||||
|
@ -45,6 +44,14 @@
|
|||
<name>crossrefTimestamp</name>
|
||||
<description>Timestamp for the Crossref incremental Harvesting</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esServer</name>
|
||||
<description>elasticsearch server url for the Crossref Harvesting</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esIndex</name>
|
||||
<description>elasticsearch index name for the Crossref Harvesting</description>
|
||||
</property>
|
||||
|
||||
<!-- MAG Parameters -->
|
||||
<property>
|
||||
|
@ -65,11 +72,19 @@
|
|||
<description>the ORCID working path</description>
|
||||
</property>
|
||||
|
||||
|
||||
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
|
@ -85,20 +100,18 @@
|
|||
</switch>
|
||||
</decision>
|
||||
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ImportCrossRef">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
|
||||
<arg>-t</arg><arg>${inputPathCrossref}/index_update</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-ts</arg><arg>${timestamp}</arg>
|
||||
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--esServer</arg><arg>${esServer}</arg>
|
||||
<arg>--esIndex</arg><arg>${esIndex}</arg>
|
||||
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
|
||||
</java>
|
||||
<ok to="GenerateCrossrefDataset"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue