opendoarId API using HDFS

Michele Artini 2020-11-09 10:38:18 +01:00
parent 96586f7b33
commit baad77945f
7 changed files with 365 additions and 42 deletions

View File

@@ -1,5 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>eu.dnetlib.dhp</groupId>
@@ -22,7 +24,40 @@
<artifactId>dnet-broker-apps-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0-cdh5.9.2</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
<repositories>
<repository>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<id>cloudera</id>
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
</repository>
</repositories>
<build>
<plugins>
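
The new hadoop-client dependency above (with slf4j-log4j12, servlet-api and guava excluded, to avoid clashing with the application's own logging binding, the embedded servlet container and its Guava version) is what the HDFS access in the controller below builds on. A minimal sketch of the API it pulls in; the namenode URL and path are illustrative assumptions, not values from this commit:

// Minimal sketch, not part of the commit: list a directory over HDFS.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsListSketch {
	public static void main(final String[] args) throws Exception {
		final Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://namenode.example.org:8020"); // illustrative
		try (final FileSystem fs = FileSystem.get(conf)) {
			for (final FileStatus status : fs.listStatus(new Path("/tmp"))) {
				System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
			}
		}
	}
}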

View File

@@ -42,12 +42,7 @@ public class BrokerConfiguration extends AbstractElasticsearchConfiguration {
@Autowired
private ElasticSearchProperties elasticSearchProperties;
public static final String TAG_EVENTS = "Events";
public static final String TAG_SUBSCRIPTIONS = "Subscriptions";
public static final String TAG_NOTIFICATIONS = "Notifications";
public static final String TAG_TOPIC_TYPES = "Topic Types";
public static final String TAG_OPENAIRE = "OpenAIRE";
public static final String TAG_MATCHING = "Subscription-Event Matching";
public static final String OA_PUBLIC_APIS = "OpenAIRE Broker Public API";
@Override
@Bean
@@ -67,10 +62,9 @@ public class BrokerConfiguration extends AbstractElasticsearchConfiguration {
return new Docket(DocumentationType.SWAGGER_2)
.select()
.apis(RequestHandlerSelectors.any())
.paths(p -> p.startsWith("/api/"))
.paths(p -> p.startsWith("/"))
.build()
.tags(new Tag(TAG_EVENTS, "Events management"), new Tag(TAG_SUBSCRIPTIONS, "Subscriptions management"), new Tag(TAG_NOTIFICATIONS,
"Notifications management"), new Tag(TAG_TOPIC_TYPES, "Topic types management"), new Tag(TAG_OPENAIRE, "OpenAIRE use case"))
.tags(new Tag(OA_PUBLIC_APIS, OA_PUBLIC_APIS))
.apiInfo(new ApiInfo(
"Literature Broker Service",
"APIs documentation",

View File

@@ -1,5 +1,8 @@
package eu.dnetlib.broker.oa.controllers;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -15,10 +18,15 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.index.query.QueryBuilders;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Profile;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.elasticsearch.core.ElasticsearchOperations;
@@ -49,7 +57,7 @@ import io.swagger.annotations.ApiOperation;
@Profile("openaire")
@RestController
@RequestMapping("/")
@Api(tags = BrokerConfiguration.TAG_OPENAIRE)
@Api(tags = BrokerConfiguration.OA_PUBLIC_APIS)
public class OpenairePublicController extends AbstractLbsController {
@Autowired
@@ -61,6 +69,12 @@ public class OpenairePublicController extends AbstractLbsController {
@Autowired
private ElasticSearchProperties props;
@Value("${lbs.hadoop.namenode.url}")
private String namenode;
@Value("${lbs.hadoop.opendoar.events.path}")
private String opendoarEventsPath;
private static final long SCROLL_TIMEOUT_IN_MILLIS = 5 * 60 * 1000;
private static final Log log = LogFactory.getLog(OpenairePublicController.class);
@@ -98,36 +112,6 @@
}
}
@ApiOperation("Returns notifications by opendorar Id (for example: 301) using scrolls (first page)")
@GetMapping("/scroll/notifications/byOpenDoarId/{opendoarId}")
public ScrollPage<ShortEventMessage> prepareScrollNotificationsByOpendoarId(@PathVariable final String opendoarId) {
final ElasticsearchRestTemplate esTemplate = (ElasticsearchRestTemplate) esOperations;
final String dsId = calculateDsIdFromOpenDoarId(opendoarId);
final NativeSearchQuery searchQuery = new NativeSearchQueryBuilder()
.withQuery(QueryBuilders.nestedQuery("map", QueryBuilders.termQuery("map.targetDatasourceId", dsId), ScoreMode.None))
.withSearchType(SearchType.DEFAULT)
.withFields("topic", "payload")
.build();
final SearchScrollHits<Notification> scroll =
esTemplate.searchScrollStart(SCROLL_TIMEOUT_IN_MILLIS, searchQuery, Notification.class, IndexCoordinates.of(props.getNotificationsIndexName()));
if (scroll.hasSearchHits()) {
final List<ShortEventMessage> values = calculateNotificationMessages(scroll);
return new ScrollPage<>(scroll.getScrollId(), values.isEmpty() || scroll.getScrollId() == null, values);
} else {
esTemplate.searchScrollClear(Arrays.asList(scroll.getScrollId()));
return new ScrollPage<>(null, true, new ArrayList<>());
}
}
private String calculateDsIdFromOpenDoarId(final String opendoarId) {
return "10|opendoar____::" + DigestUtils.md5Hex(opendoarId);
}
@ApiOperation("Returns notifications using scrolls (other pages)")
@GetMapping("/scroll/notifications/{scrollId}")
public ScrollPage<ShortEventMessage> scrollNotifications(@PathVariable final String scrollId) {
@@ -182,6 +166,61 @@
}
@ApiOperation("Returns events as file by opendoarId")
@GetMapping(value = "/file/events/opendoar/{id}", produces = "application/gzip")
public void opendoarEventsAsFile(final HttpServletResponse res, @PathVariable final String id) {
res.setHeader("Content-Disposition", "attachment; filename=dump.json.gz");
final Configuration conf = new Configuration();
conf.addResource(getClass().getResourceAsStream("/core-site.xml"));
conf.addResource(getClass().getResourceAsStream("/ocean-hadoop-conf.xml"));
final Path pathDir = new Path(opendoarEventsPath + "/" + DigestUtils.md5Hex(id));
try (final FileSystem fs = FileSystem.get(conf);
final ServletOutputStream out = res.getOutputStream();
final GZIPOutputStream gzOut = new GZIPOutputStream(out)) {
boolean first = true;
IOUtils.write("[\n", gzOut);
try {
for (final FileStatus fileStatus : fs.listStatus(pathDir)) {
if (fileStatus.isFile()) {
final Path path = fileStatus.getPath();
if (path.getName().endsWith(".json")) {
try (final FSDataInputStream fis = fs.open(path);
final InputStreamReader isr = new InputStreamReader(fis);
final BufferedReader br = new BufferedReader(isr)) {
String line = br.readLine();
while (line != null) {
if (first) {
first = false;
} else {
IOUtils.write(",\n", gzOut);
}
IOUtils.write(line, gzOut);
line = br.readLine();
}
}
}
}
}
} catch (final FileNotFoundException e) {
log.warn("File not found - " + e.getMessage());
}
IOUtils.write("\n]\n", gzOut);
gzOut.flush();
} catch (final Throwable e) {
log.error("Error accessing hdfs file", e);
throw new RuntimeException(e);
}
}
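
A hypothetical client for the endpoint above: it downloads the gzipped JSON array and prints it line by line. Host, port and the OpenDOAR id 301 are illustrative assumptions; only the /file/events/opendoar/{id} path comes from the controller:

// Hypothetical client sketch, not part of the commit.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.zip.GZIPInputStream;

public class OpendoarDumpClient {
	public static void main(final String[] args) throws Exception {
		final URL url = new URL("http://localhost:8080/file/events/opendoar/301");
		final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
		try (final BufferedReader br = new BufferedReader(
				new InputStreamReader(new GZIPInputStream(conn.getInputStream())))) {
			String line;
			while ((line = br.readLine()) != null) {
				System.out.println(line); // "[", one event per line, "]"
			}
		}
	}
}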
private List<ShortEventMessage> calculateNotificationMessages(final SearchScrollHits<Notification> scroll) {
if (scroll.getSearchHits().size() > 0) {
return scroll.stream()

View File

@@ -19,6 +19,8 @@ spring.jpa.properties.hibernate.format_sql = false
lbs.database.url = ${spring.datasource.url}
lbs.hadoop.opendoar.events.path = /user/michele.artini/broker_events/eventsByOpendoarId
# for development server
#lbs.elastic.clusterNodes = broker1-dev-dnet.d4science.org:9300
#lbs.elastic.homepage = http://broker1-dev-dnet.d4science.org:9200/_plugin/hq
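
Note that the controller also injects lbs.hadoop.namenode.url via @Value, while this diff only adds lbs.hadoop.opendoar.events.path; the namenode URL is presumably defined in a deployment-specific configuration not shown here. A hypothetical way to make that assumption explicit would be a @Value default (hdfs://nameservice1 is taken from the bundled core-site.xml below):

// Hypothetical sketch, not part of the commit:
@Value("${lbs.hadoop.namenode.url:hdfs://nameservice1}")
private String namenode;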

View File

@@ -0,0 +1,145 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>fs.trash.interval</name>
<value>1</value>
</property>
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>simple</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>false</value>
</property>
<property>
<name>hadoop.rpc.protection</name>
<value>authentication</value>
</property>
<property>
<name>hadoop.security.auth_to_local</name>
<value>DEFAULT</value>
</property>
<property>
<name>hadoop.proxyuser.oozie.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.oozie.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.mapred.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.mapred.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.flume.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.flume.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.HTTP.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.HTTP.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.httpfs.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.httpfs.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hdfs.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hdfs.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.yarn.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.yarn.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.security.group.mapping</name>
<value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
</property>
<property>
<name>hadoop.security.instrumentation.requires.admin</name>
<value>false</value>
</property>
<property>
<name>net.topology.script.file.name</name>
<value>/etc/hadoop/conf.cloudera.yarn2/topology.py</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>65536</value>
</property>
<property>
<name>hadoop.ssl.enabled</name>
<value>false</value>
</property>
<property>
<name>hadoop.ssl.require.client.cert</name>
<value>false</value>
<final>true</final>
</property>
<property>
<name>hadoop.ssl.keystores.factory.class</name>
<value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
<final>true</final>
</property>
<property>
<name>hadoop.ssl.server.conf</name>
<value>ssl-server.xml</value>
<final>true</final>
</property>
<property>
<name>hadoop.ssl.client.conf</name>
<value>ssl-client.xml</value>
<final>true</final>
</property>
</configuration>

View File

@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<configuration>
<property>
<name>dfs.nameservices</name>
<value>nameservice1</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.nameservice1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.ha.automatic-failover.enabled.nameservice1</name>
<value>true</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>iis-cdh5-test-m1.ocean.icm.edu.pl:2181,iis-cdh5-test-m2.ocean.icm.edu.pl:2181,iis-cdh5-test-m3.ocean.icm.edu.pl:2181</value>
</property>
<property>
<name>dfs.ha.namenodes.nameservice1</name>
<value>namenode528,namenode434</value>
</property>
<property>
<name>dfs.namenode.rpc-address.nameservice1.namenode528</name>
<value>iis-cdh5-test-m1.ocean.icm.edu.pl:8020</value>
</property>
<property>
<name>dfs.namenode.servicerpc-address.nameservice1.namenode528</name>
<value>iis-cdh5-test-m1.ocean.icm.edu.pl:8022</value>
</property>
<property>
<name>dfs.namenode.http-address.nameservice1.namenode528</name>
<value>iis-cdh5-test-m1.ocean.icm.edu.pl:50070</value>
</property>
<property>
<name>dfs.namenode.https-address.nameservice1.namenode528</name>
<value>iis-cdh5-test-m1.ocean.icm.edu.pl:50470</value>
</property>
<property>
<name>dfs.namenode.rpc-address.nameservice1.namenode434</name>
<value>iis-cdh5-test-m2.ocean.icm.edu.pl:8020</value>
</property>
<property>
<name>dfs.namenode.servicerpc-address.nameservice1.namenode434</name>
<value>iis-cdh5-test-m2.ocean.icm.edu.pl:8022</value>
</property>
<property>
<name>dfs.namenode.http-address.nameservice1.namenode434</name>
<value>iis-cdh5-test-m2.ocean.icm.edu.pl:50070</value>
</property>
<property>
<name>dfs.namenode.https-address.nameservice1.namenode434</name>
<value>iis-cdh5-test-m2.ocean.icm.edu.pl:50470</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>false</value>
</property>
<property>
<name>fs.permissions.umask-mode</name>
<value>022</value>
</property>
<property>
<name>dfs.namenode.acls.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.client.use.legacy.blockreader</name>
<value>false</value>
</property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>false</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/run/hdfs-sockets/dn</value>
</property>
<property>
<name>dfs.client.read.shortcircuit.skip.checksum</name>
<value>false</value>
</property>
<property>
<name>dfs.client.domain.socket.data.traffic</name>
<value>false</value>
</property>
<property>
<name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
<value>true</value>
</property>
</configuration>
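
Together, the two bundled XML files let a client resolve the logical authority hdfs://nameservice1: the ConfiguredFailoverProxyProvider tries namenode528 and namenode434 until it reaches the active one. A minimal sketch of that resolution, mirroring the resource loading in OpenairePublicController (which bundled file carries which settings is an assumption, since file names are not shown in this view):

// Minimal sketch, assuming both XML files are on the classpath.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class HaResolutionSketch {
	public static void main(final String[] args) throws Exception {
		final Configuration conf = new Configuration();
		conf.addResource(HaResolutionSketch.class.getResourceAsStream("/core-site.xml"));
		conf.addResource(HaResolutionSketch.class.getResourceAsStream("/ocean-hadoop-conf.xml"));
		// fs.defaultFS = hdfs://nameservice1; the failover proxy provider
		// selects the active namenode among the two configured ones.
		try (final FileSystem fs = FileSystem.get(conf)) {
			System.out.println("Connected to " + fs.getUri());
		}
	}
}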

View File

@@ -0,0 +1,7 @@
<!DOCTYPE html>
<html>
<head>
<title>OpenAIRE Broker Public API</title>
<meta http-equiv="refresh" content="2; url = /swagger" />
</head>
</html>