dnet-applications/apps/scholexplorer-api/src/main/java/eu/dnetlib/scholix/api/index/ScholixIndexManager.java

316 lines
12 KiB
Java

package eu.dnetlib.scholix.api.index;
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
import eu.dnetlib.scholix.api.ScholixException;
import eu.dnetlib.scholix.api.TaggedCounter;
import io.micrometer.core.annotation.Timed;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.*;
import org.elasticsearch.search.aggregations.Aggregation;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.bucket.nested.ParsedNested;
import org.elasticsearch.search.aggregations.bucket.terms.ParsedStringTerms;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.elasticsearch.core.ElasticsearchRestTemplate;
import org.springframework.data.elasticsearch.core.SearchHit;
import org.springframework.data.elasticsearch.core.SearchHits;
import org.springframework.data.elasticsearch.core.mapping.IndexCoordinates;
import org.springframework.data.elasticsearch.core.query.NativeSearchQuery;
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
/**
 * Builds and executes the Elasticsearch queries used by the Scholix API.
 *
 * <p>Every search borrows a client from {@link #connectionPool} and hands it back in a
 * {@code finally} block, so a failing request does not leave the borrowed client outside the pool.
 */
@Component
public class ScholixIndexManager {

    /** Number of hits requested per page by every query issued from this class. */
    private static final int PAGE_SIZE = 10;

    /** Maximum number of term buckets requested by the aggregation queries. */
    private static final int MAX_BUCKETS = 100;

    /** Name of the nested aggregation wrapping the terms aggregation. */
    private static final String NESTED_AGGREGATION = "nested";

    /** Name of the terms aggregation holding the per-name document counts. */
    private static final String TERMS_AGGREGATION = "by_map";

    /**
     * Configuration of the Elasticsearch index; read here for the index name.
     */
    @Autowired
    ElasticSearchProperties elasticSearchProperties;

    /**
     * Pool from which the Elasticsearch clients are borrowed and to which they are returned.
     */
    @Autowired
    TaggedCounter myCounter;

    /**
     * Counter incremented once per filter used in a request, keyed by the filter name.
     */
    @Autowired
    ElasticSearchPool connectionPool;

    /**
     * Side of a Scholix relation a filter applies to. The constant name is used verbatim as the
     * first segment of the index field path (e.g. {@code source.identifier}).
     */
    public enum RelationPrefix {
        /**
         * The source object of the relation.
         */
        source,
        /**
         * The target object of the relation.
         */
        target
    }

    /**
     * Fails with the exception used throughout this class when no relation side is given.
     *
     * @param prefix the relation side to validate
     * @throws ScholixException if {@code prefix} is {@code null}
     */
    private static void requirePrefix(final RelationPrefix prefix) throws ScholixException {
        if (prefix == null) {
            throw new ScholixException("prefix cannot be null");
        }
    }

    /**
     * Builds a non-scoring nested query matching an exact term.
     *
     * @param path  the nested object path
     * @param field the full name of the field inside the nested object
     * @param value the exact value to match
     * @return the nested term query
     */
    private static QueryBuilder nestedTerm(final String path, final String field, final String value) {
        return new NestedQueryBuilder(path, new TermQueryBuilder(field, value), ScoreMode.None);
    }

    /** Matches relations whose source/target has the given {@code objectType}. */
    private QueryBuilder createObjectTypeQuery(final RelationPrefix prefix, final String objectType) throws ScholixException {
        requirePrefix(prefix);
        return nestedTerm(prefix.name(), prefix.name() + ".objectType", objectType);
    }

    /** Matches relations whose source/target has an identifier with the given schema (pid type). */
    private QueryBuilder createPidTypeQuery(final RelationPrefix prefix, final String pidTypeValue) throws ScholixException {
        requirePrefix(prefix);
        return nestedTerm(prefix.name() + ".identifier", prefix.name() + ".identifier.schema", pidTypeValue);
    }

    /** Matches relations provided by the link provider with the given name. */
    private QueryBuilder createLinkProviderQuery(final String providerName) throws ScholixException {
        if (providerName == null) {
            throw new ScholixException("providerName cannot be null");
        }
        return nestedTerm("linkprovider", "linkprovider.name", providerName);
    }

    /** Matches relations whose source/target has a publisher with the given name. */
    private QueryBuilder createLinkPublisherQuery(final RelationPrefix prefix, final String publisher) throws ScholixException {
        requirePrefix(prefix);
        return nestedTerm(prefix.name() + ".publisher", prefix.name() + ".publisher.name", publisher);
    }

    /** Matches relations whose source/target has an identifier with the given value. */
    private QueryBuilder createPidValueQuery(final RelationPrefix prefix, final String pidValue) throws ScholixException {
        requirePrefix(prefix);
        return nestedTerm(prefix.name() + ".identifier", prefix.name() + ".identifier.identifier", pidValue);
    }

    /**
     * Combines the given filters into one query: a single filter is returned as is, several
     * filters are joined as {@code must} clauses of a bool query.
     *
     * @param queries the filters to combine
     * @return the combined query
     * @throws ScholixException if {@code queries} is {@code null} or empty
     */
    private QueryBuilder createFinalQuery(final List<QueryBuilder> queries) throws ScholixException {
        if (queries == null || queries.isEmpty()) {
            throw new ScholixException("the list of queries must be not empty");
        }
        if (queries.size() == 1) {
            return queries.get(0);
        }
        final BoolQueryBuilder conjunction = new BoolQueryBuilder();
        queries.forEach(conjunction::must);
        return conjunction;
    }

    /**
     * Increments the counter {@code <prefix>_doi}, {@code <prefix>_pmc} or {@code <prefix>_other}
     * depending on the requested pid type (compared case-insensitively).
     *
     * @param prefix the relation side the pid type filter applies to
     * @param value  the requested pid type; must not be {@code null}
     */
    private void incrementPidCounter(RelationPrefix prefix, String value) {
        // Locale.ROOT keeps the comparison independent of the JVM default locale
        // (e.g. "DOI" lower-cases to "doı" under a Turkish locale).
        switch (value.toLowerCase(Locale.ROOT)) {
            case "doi":
                myCounter.increment(String.format("%s_doi", prefix));
                break;
            case "pmc":
                myCounter.increment(String.format("%s_pmc", prefix));
                break;
            default:
                myCounter.increment(String.format("%s_other", prefix));
        }
    }

    /** Coordinates of the configured Scholix index. */
    private IndexCoordinates scholixIndex() {
        return IndexCoordinates.of(elasticSearchProperties.getIndexName());
    }

    /**
     * Runs {@code query} with a terms aggregation on a field of a nested object and returns the
     * document count of each bucket.
     *
     * @param query      the query selecting the documents to aggregate
     * @param nestedPath the path of the nested object
     * @param termField  the full name of the field to bucket on
     * @return one (key, document count) pair per bucket, or {@code null} when the response carries
     *         no aggregations
     * @throws ScholixException declared for the callers' sake; propagated from the pool if it raises it
     */
    private List<Pair<String, Long>> countByNestedTerm(final QueryBuilder query, final String nestedPath, final String termField) throws ScholixException {
        final NativeSearchQuery searchQuery = new NativeSearchQueryBuilder()
            .withQuery(query)
            .withSearchType(SearchType.DEFAULT)
            .withPageable(PageRequest.of(0, PAGE_SIZE))
            .addAggregation(AggregationBuilders.nested(NESTED_AGGREGATION, nestedPath)
                .subAggregation(AggregationBuilders.terms(TERMS_AGGREGATION).field(termField).size(MAX_BUCKETS).minDocCount(1)))
            .build();

        final Aggregations aggregations;
        final Pair<RestHighLevelClient, ElasticsearchRestTemplate> resource = connectionPool.getResource();
        try {
            final SearchHits<Scholix> hits = resource.getValue().search(searchQuery, Scholix.class, scholixIndex());
            aggregations = hits.getAggregations();
        } finally {
            // Hand the client back even when the search fails.
            connectionPool.returnResource(resource);
        }

        if (aggregations == null) {
            return null;
        }
        final Aggregation byName = ((ParsedNested) aggregations.asMap().get(NESTED_AGGREGATION))
            .getAggregations().asMap().get(TERMS_AGGREGATION);
        return ((ParsedStringTerms) byName).getBuckets()
            .stream()
            .map(b -> new ImmutablePair<>(b.getKeyAsString(), b.getDocCount()))
            .collect(Collectors.toList());
    }

    /**
     * Counts the links per link provider.
     *
     * @param filterName when not blank, only links of the provider with this name are considered;
     *                   otherwise all links are
     * @return one (provider name, link count) pair per provider, at most 100 entries, or
     *         {@code null} when the response carries no aggregations
     * @throws ScholixException the scholix exception
     */
    public List<Pair<String, Long>> totalLinksByProvider(final String filterName) throws ScholixException {
        final QueryBuilder query = StringUtils.isNotBlank(filterName)
            ? createLinkProviderQuery(filterName)
            : QueryBuilders.matchAllQuery();
        return countByNestedTerm(query, "linkprovider", "linkprovider.name");
    }

    /**
     * Counts the links per publisher of the source or target object.
     *
     * @param prefix     the relation side whose publishers are counted; must not be {@code null}
     * @param filterName when not blank, only links whose publisher has this name are considered;
     *                   otherwise all links are
     * @return one (publisher name, link count) pair per publisher, at most 100 entries, or
     *         {@code null} when the response carries no aggregations
     * @throws ScholixException if {@code prefix} is {@code null}
     */
    public List<Pair<String, Long>> totalLinksPublisher(final RelationPrefix prefix, final String filterName) throws ScholixException {
        // Without this check a null prefix would be formatted into the field path "null.publisher".
        requirePrefix(prefix);
        final QueryBuilder query = StringUtils.isNotBlank(filterName)
            ? createLinkPublisherQuery(prefix, filterName)
            : QueryBuilders.matchAllQuery();
        return countByNestedTerm(query, prefix.name() + ".publisher", prefix.name() + ".publisher.name");
    }

    /**
     * Retrieves one page of links matching all the given filters. Blank filters are ignored; at
     * least one non-blank filter is required.
     *
     * @param linkProvider    the link provider
     * @param targetPid       the target pid
     * @param targetPidType   the target pid type
     * @param targetPublisher the target publisher
     * @param targetType      the target type
     * @param sourcePid       the source pid
     * @param sourcePidType   the source pid type
     * @param sourcePublisher the source publisher
     * @param sourceType      the source type
     * @param harvestedAfter  the harvested after; accepted but currently not applied to the query
     * @param page            the zero-based page number; {@code null} is treated as the first page
     * @return the total number of matching links paired with the (at most 10) links of the requested page
     * @throws ScholixException if no usable filter is supplied
     */
    @Timed(value = "scholix.index.request.links", description = "Time taken to request index")
    public Pair<Long,List<Scholix>> linksFromPid ( final String linkProvider,
                                                   final String targetPid, final String targetPidType, final String targetPublisher,
                                                   final String targetType, final String sourcePid, final String sourcePidType,
                                                   final String sourcePublisher, final String sourceType, final String harvestedAfter,
                                                   final Integer page) throws ScholixException {
        if (sourcePid == null && sourcePidType == null && targetPid == null && targetPidType == null
                && sourcePublisher == null && targetPublisher == null && linkProvider == null) {
            throw new ScholixException("One of sourcePid, targetPid, sourcePublisher, targetPublisher, linkProvider should be not null");
        }

        final List<QueryBuilder> queries = new ArrayList<>();
        if (StringUtils.isNotBlank(linkProvider)) {
            myCounter.increment("linkProvider");
            queries.add(createLinkProviderQuery(linkProvider));
        }
        if (StringUtils.isNotBlank(targetPid)) {
            myCounter.increment("targetPid");
            queries.add(createPidValueQuery(RelationPrefix.target, targetPid));
        }
        if (StringUtils.isNotBlank(sourcePid)) {
            myCounter.increment("sourcePid");
            queries.add(createPidValueQuery(RelationPrefix.source, sourcePid));
        }
        if (StringUtils.isNotBlank(targetPidType)) {
            incrementPidCounter(RelationPrefix.target, targetPidType);
            queries.add(createPidTypeQuery(RelationPrefix.target, targetPidType));
        }
        if (StringUtils.isNotBlank(sourcePidType)) {
            incrementPidCounter(RelationPrefix.source, sourcePidType);
            queries.add(createPidTypeQuery(RelationPrefix.source, sourcePidType));
        }
        if (StringUtils.isNotBlank(targetType)) {
            if ("dataset".equalsIgnoreCase(targetType) || "publication".equalsIgnoreCase(targetType)) {
                myCounter.increment(String.format("targetType_%s", targetType));
            }
            queries.add(createObjectTypeQuery(RelationPrefix.target, targetType));
        }
        if (StringUtils.isNotBlank(sourceType)) {
            if ("dataset".equalsIgnoreCase(sourceType) || "publication".equalsIgnoreCase(sourceType)) {
                myCounter.increment(String.format("sourceType_%s", sourceType));
            }
            queries.add(createObjectTypeQuery(RelationPrefix.source, sourceType));
        }
        if (StringUtils.isNotBlank(targetPublisher)) {
            myCounter.increment("targetPublisher");
            queries.add(createLinkPublisherQuery(RelationPrefix.target, targetPublisher));
        }
        // The source publisher filter mirrors the target one; it used to be accepted by the
        // precondition above but never reached the query.
        if (StringUtils.isNotBlank(sourcePublisher)) {
            myCounter.increment("sourcePublisher");
            queries.add(createLinkPublisherQuery(RelationPrefix.source, sourcePublisher));
        }

        final int pageNumber = page == null ? 0 : page;
        final NativeSearchQuery finalQuery = new NativeSearchQueryBuilder()
            .withQuery(createFinalQuery(queries))
            .withPageable(PageRequest.of(pageNumber, PAGE_SIZE))
            .build();

        final long total;
        final SearchHits<Scholix> scholixRes;
        final Pair<RestHighLevelClient, ElasticsearchRestTemplate> resource = connectionPool.getResource();
        try {
            final ElasticsearchRestTemplate client = resource.getValue();
            total = client.count(finalQuery, Scholix.class, scholixIndex());
            scholixRes = client.search(finalQuery, Scholix.class, scholixIndex());
        } finally {
            // Hand the client back even when count or search fails.
            connectionPool.returnResource(resource);
        }
        return new ImmutablePair<>(total, scholixRes.stream().map(SearchHit::getContent).collect(Collectors.toList()));
    }
}