2022-02-07 16:36:26 +01:00
|
|
|
package eu.dnetlib.scholix.api.index;
|
|
|
|
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
|
|
|
|
import eu.dnetlib.scholix.api.ScholixException;
|
|
|
|
import eu.dnetlib.scholix.api.TaggedCounter;
|
|
|
|
import io.micrometer.core.annotation.Timed;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
import org.apache.commons.lang3.tuple.ImmutablePair;
|
|
|
|
import org.apache.commons.lang3.tuple.Pair;
|
|
|
|
import org.apache.lucene.search.join.ScoreMode;
|
2022-02-09 11:33:09 +01:00
|
|
|
import org.elasticsearch.action.search.SearchType;
|
2022-02-08 09:57:45 +01:00
|
|
|
import org.elasticsearch.client.RestHighLevelClient;
|
2022-02-09 11:33:09 +01:00
|
|
|
import org.elasticsearch.index.query.*;
|
|
|
|
import org.elasticsearch.search.aggregations.Aggregation;
|
|
|
|
import org.elasticsearch.search.aggregations.AggregationBuilders;
|
|
|
|
import org.elasticsearch.search.aggregations.Aggregations;
|
|
|
|
import org.elasticsearch.search.aggregations.bucket.nested.ParsedNested;
|
|
|
|
import org.elasticsearch.search.aggregations.bucket.terms.ParsedStringTerms;
|
2022-02-07 16:36:26 +01:00
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
|
import org.springframework.data.domain.PageRequest;
|
2022-02-08 09:57:45 +01:00
|
|
|
import org.springframework.data.elasticsearch.core.ElasticsearchRestTemplate;
|
2022-02-07 16:36:26 +01:00
|
|
|
import org.springframework.data.elasticsearch.core.SearchHit;
|
|
|
|
import org.springframework.data.elasticsearch.core.SearchHits;
|
|
|
|
import org.springframework.data.elasticsearch.core.mapping.IndexCoordinates;
|
|
|
|
import org.springframework.data.elasticsearch.core.query.NativeSearchQuery;
|
|
|
|
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
|
|
|
|
import org.springframework.stereotype.Component;
|
|
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Builds Elasticsearch queries over the Scholix index and runs them through pooled clients.
|
|
|
|
*/
|
|
|
|
@Component
|
|
|
|
public class ScholixIndexManager {
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The Elastic search properties.
|
|
|
|
*/
|
|
|
|
@Autowired
|
|
|
|
ElasticSearchProperties elasticSearchProperties;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The pool from which Elasticsearch client resources are borrowed and to which they are returned.
|
|
|
|
*/
|
|
|
|
@Autowired
|
2022-02-08 09:57:45 +01:00
|
|
|
ElasticSearchPool connectionPool;
|
2022-02-07 16:36:26 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* The counter incremented to record which request filters are used.
|
|
|
|
*/
|
|
|
|
@Autowired
|
|
|
|
TaggedCounter myCounter;
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Side of a Scholix relation that a filter or aggregation applies to.
 * The constant name is used verbatim as the leading segment of the index
 * field paths built by this class (for example {@code source.identifier}).
 */
public enum RelationPrefix {

    /** The source side of the relation. */
    source,

    /** The target side of the relation. */
    target
}
|
|
|
|
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
private QueryBuilder createObjectTypeQuery(final RelationPrefix prefix, final String objectType ) throws ScholixException{
|
2022-02-07 16:36:26 +01:00
|
|
|
if (prefix == null){
|
|
|
|
throw new ScholixException("prefix cannot be null");
|
|
|
|
}
|
|
|
|
return new NestedQueryBuilder(String.format("%s", prefix), new TermQueryBuilder(String.format("%s.objectType",prefix), objectType), ScoreMode.None);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
private QueryBuilder createPidTypeQuery(final RelationPrefix prefix, final String pidTypeValue ) throws ScholixException{
|
2022-02-07 16:36:26 +01:00
|
|
|
if (prefix == null){
|
|
|
|
throw new ScholixException("prefix cannot be null");
|
|
|
|
}
|
|
|
|
return new NestedQueryBuilder(String.format("%s.identifier", prefix), new TermQueryBuilder(String.format("%s.identifier.schema",prefix), pidTypeValue), ScoreMode.None);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
private QueryBuilder createLinkProviderQuery(final String providerName ) throws ScholixException{
|
|
|
|
if (providerName == null){
|
|
|
|
throw new ScholixException("prefix cannot be null");
|
|
|
|
}
|
|
|
|
return new NestedQueryBuilder("linkprovider", new TermQueryBuilder("linkprovider.name",providerName), ScoreMode.None);
|
|
|
|
}
|
|
|
|
|
|
|
|
private QueryBuilder createLinkPublisherQuery(final RelationPrefix prefix, final String publisher ) throws ScholixException{
|
|
|
|
if (prefix == null){
|
|
|
|
throw new ScholixException("prefix cannot be null");
|
|
|
|
}
|
|
|
|
return new NestedQueryBuilder(String.format("%s.publisher", prefix), new TermQueryBuilder(String.format("%s.publisher.name",prefix), publisher), ScoreMode.None);
|
|
|
|
}
|
|
|
|
|
|
|
|
private QueryBuilder createPidValueQuery(final RelationPrefix prefix, final String pidValue ) throws ScholixException{
|
2022-02-07 16:36:26 +01:00
|
|
|
if (prefix == null){
|
|
|
|
throw new ScholixException("prefix cannot be null");
|
|
|
|
}
|
|
|
|
return new NestedQueryBuilder(String.format("%s.identifier", prefix), new TermQueryBuilder(String.format("%s.identifier.identifier",prefix), pidValue), ScoreMode.None);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private QueryBuilder createFinalQuery(final List<QueryBuilder> queries) throws ScholixException{
|
|
|
|
|
|
|
|
if (queries == null || queries.isEmpty())
|
|
|
|
throw new ScholixException("the list of queries must be not empty");
|
|
|
|
|
|
|
|
|
|
|
|
if (queries.size() ==1) {
|
|
|
|
return queries.get(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
else {
|
|
|
|
final BoolQueryBuilder b = new BoolQueryBuilder();
|
|
|
|
b.must().addAll(queries);
|
|
|
|
|
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
private void incrementPidCounter(RelationPrefix prefix, String value) {
|
2022-02-07 16:36:26 +01:00
|
|
|
switch (value.toLowerCase()){
|
|
|
|
case "doi": {
|
|
|
|
myCounter.increment(String.format("%s_doi", prefix));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case "pmc": {
|
|
|
|
myCounter.increment(String.format("%s_pmc", prefix));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
myCounter.increment(String.format("%s_other", prefix));
|
|
|
|
}
|
2022-02-09 11:33:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
public List<Pair<String, Long>> totalLinksByProvider(final String filterName) throws ScholixException {
|
|
|
|
|
|
|
|
|
|
|
|
final QueryBuilder query = StringUtils.isNoneBlank(filterName)?createLinkProviderQuery(filterName):QueryBuilders.matchAllQuery();
|
2022-02-09 11:33:09 +01:00
|
|
|
|
|
|
|
final NativeSearchQuery searchQuery = new NativeSearchQueryBuilder()
|
2022-02-14 09:16:37 +01:00
|
|
|
.withQuery(query)
|
2022-02-09 11:33:09 +01:00
|
|
|
.withSearchType(SearchType.DEFAULT)
|
|
|
|
.withPageable(PageRequest.of(0,10))
|
|
|
|
.addAggregation(AggregationBuilders.nested("nested", "linkprovider")
|
|
|
|
.subAggregation(AggregationBuilders.terms("by_map").field("linkprovider.name").size(100).minDocCount(1)))
|
|
|
|
.build();
|
|
|
|
|
|
|
|
|
|
|
|
Pair<RestHighLevelClient, ElasticsearchRestTemplate> resource = connectionPool.getResource();
|
|
|
|
ElasticsearchRestTemplate client = resource.getValue();
|
|
|
|
final SearchHits<Scholix> hits = client.search(searchQuery, Scholix.class, IndexCoordinates.of(elasticSearchProperties.getIndexName()));
|
|
|
|
|
|
|
|
final Aggregations aggregations = hits.getAggregations();
|
2022-02-14 09:16:37 +01:00
|
|
|
connectionPool.returnResource(resource);
|
|
|
|
|
|
|
|
if(aggregations == null)
|
|
|
|
return null;
|
|
|
|
|
|
|
|
final Aggregation aggByMap = ((ParsedNested) aggregations.asMap().get("nested")).getAggregations().asMap().get("by_map");
|
|
|
|
|
|
|
|
|
|
|
|
return ((ParsedStringTerms) aggByMap).getBuckets()
|
|
|
|
.stream()
|
|
|
|
.map(b -> new ImmutablePair<>(b.getKeyAsString(), b.getDocCount()))
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
}
|
|
|
|
|
2022-02-07 16:36:26 +01:00
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
public List<Pair<String, Long>> totalLinksPublisher(final RelationPrefix prefix, final String filterName) throws ScholixException {
|
|
|
|
|
|
|
|
|
|
|
|
final QueryBuilder query = StringUtils.isNoneBlank(filterName)?createLinkPublisherQuery(prefix,filterName):QueryBuilders.matchAllQuery();
|
|
|
|
|
|
|
|
final NativeSearchQuery searchQuery = new NativeSearchQueryBuilder()
|
|
|
|
.withQuery(query)
|
|
|
|
.withSearchType(SearchType.DEFAULT)
|
|
|
|
.withPageable(PageRequest.of(0,10))
|
|
|
|
.addAggregation(AggregationBuilders.nested("nested", String.format("%s.publisher", prefix ))
|
|
|
|
.subAggregation(AggregationBuilders.terms("by_map").field(String.format("%s.publisher.name", prefix )).size(100).minDocCount(1)))
|
|
|
|
.build();
|
|
|
|
|
|
|
|
|
|
|
|
Pair<RestHighLevelClient, ElasticsearchRestTemplate> resource = connectionPool.getResource();
|
|
|
|
ElasticsearchRestTemplate client = resource.getValue();
|
|
|
|
final SearchHits<Scholix> hits = client.search(searchQuery, Scholix.class, IndexCoordinates.of(elasticSearchProperties.getIndexName()));
|
|
|
|
|
|
|
|
final Aggregations aggregations = hits.getAggregations();
|
|
|
|
connectionPool.returnResource(resource);
|
2022-02-09 11:33:09 +01:00
|
|
|
|
|
|
|
if(aggregations == null)
|
|
|
|
return null;
|
|
|
|
|
|
|
|
final Aggregation aggByMap = ((ParsedNested) aggregations.asMap().get("nested")).getAggregations().asMap().get("by_map");
|
|
|
|
|
|
|
|
|
|
|
|
return ((ParsedStringTerms) aggByMap).getBuckets()
|
|
|
|
.stream()
|
|
|
|
.map(b -> new ImmutablePair<>(b.getKeyAsString(), b.getDocCount()))
|
|
|
|
.collect(Collectors.toList());
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-02-09 11:33:09 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-02-07 16:36:26 +01:00
|
|
|
/**
|
|
|
|
* Links from pid pair.
|
|
|
|
*
|
|
|
|
* @param linkProvider the link provider
|
|
|
|
* @param targetPid the target pid
|
|
|
|
* @param targetPidType the target pid type
|
|
|
|
* @param targetPublisher the target publisher
|
|
|
|
* @param targetType the target type
|
|
|
|
* @param sourcePid the source pid
|
|
|
|
* @param sourcePidType the source pid type
|
|
|
|
* @param sourcePublisher the source publisher
|
|
|
|
* @param sourceType the source type
|
|
|
|
* @param harvestedAfter the harvested after
|
|
|
|
* @param page the page
|
|
|
|
* @return the pair
|
|
|
|
* @throws ScholixException the scholix exception
|
|
|
|
*/
|
|
|
|
@Timed(value = "scholix.index.request.links", description = "Time taken to request index")
|
2022-02-14 09:16:37 +01:00
|
|
|
public Pair<Long,List<Scholix>> linksFromPid ( final String linkProvider,
|
2022-02-07 16:36:26 +01:00
|
|
|
final String targetPid, final String targetPidType, final String targetPublisher,
|
|
|
|
final String targetType, final String sourcePid, final String sourcePidType,
|
|
|
|
final String sourcePublisher, final String sourceType, final String harvestedAfter,
|
|
|
|
final Integer page) throws ScholixException {
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
|
2022-02-07 16:36:26 +01:00
|
|
|
|
|
|
|
if (sourcePid==null && sourcePidType==null && targetPid==null && targetPidType==null && sourcePublisher==null && targetPublisher==null && linkProvider==null)
|
|
|
|
throw new ScholixException("One of sourcePid, targetPid, sourcePublisher, targetPublisher, linkProvider should be not null");
|
|
|
|
|
|
|
|
final List<QueryBuilder> queries = new ArrayList<>();
|
|
|
|
|
2022-02-14 09:16:37 +01:00
|
|
|
if (StringUtils.isNoneBlank(linkProvider)) {
|
|
|
|
myCounter.increment("linkProvider");
|
|
|
|
queries.add(createLinkProviderQuery(linkProvider));
|
|
|
|
}
|
|
|
|
|
2022-02-07 16:36:26 +01:00
|
|
|
if (StringUtils.isNoneBlank(targetPid)) {
|
|
|
|
myCounter.increment("targetPid");
|
2022-02-14 09:16:37 +01:00
|
|
|
queries.add(createPidValueQuery(RelationPrefix.target, targetPid));
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
if (StringUtils.isNoneBlank(sourcePid)) {
|
|
|
|
myCounter.increment("sourcePid");
|
2022-02-14 09:16:37 +01:00
|
|
|
queries.add(createPidValueQuery(RelationPrefix.source, sourcePid));
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (StringUtils.isNoneBlank(targetPidType)) {
|
|
|
|
assert targetPidType != null;
|
2022-02-14 09:16:37 +01:00
|
|
|
incrementPidCounter(RelationPrefix.target,targetPidType);
|
|
|
|
queries.add(createPidTypeQuery(RelationPrefix.target, targetPidType));
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
if (StringUtils.isNoneBlank(sourcePidType)) {
|
|
|
|
assert sourcePidType != null;
|
2022-02-14 09:16:37 +01:00
|
|
|
incrementPidCounter(RelationPrefix.source,sourcePidType);
|
|
|
|
queries.add(createPidTypeQuery(RelationPrefix.source, sourcePidType));
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (StringUtils.isNoneBlank(targetType)) {
|
|
|
|
if ("dataset".equalsIgnoreCase(targetType) || "publication".equalsIgnoreCase(targetType))
|
|
|
|
myCounter.increment(String.format("targetType_%s", targetType));
|
2022-02-14 09:16:37 +01:00
|
|
|
queries.add(createObjectTypeQuery(RelationPrefix.target, targetType));
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (StringUtils.isNoneBlank(sourceType)) {
|
|
|
|
if ("dataset".equalsIgnoreCase(sourceType) || "publication".equalsIgnoreCase(sourceType)) {
|
|
|
|
myCounter.increment(String.format("sourceType_%s", sourceType));
|
|
|
|
}
|
2022-02-14 09:16:37 +01:00
|
|
|
queries.add(createObjectTypeQuery(RelationPrefix.source, sourceType));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (StringUtils.isNoneBlank(targetPublisher)) {
|
|
|
|
myCounter.increment("targetPublisher");
|
|
|
|
|
|
|
|
queries.add(createLinkPublisherQuery(RelationPrefix.target,targetPublisher));
|
2022-02-07 16:36:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
QueryBuilder result = createFinalQuery(queries);
|
|
|
|
|
|
|
|
NativeSearchQuery finalQuery = new NativeSearchQueryBuilder()
|
|
|
|
.withQuery(result)
|
|
|
|
.withPageable(PageRequest.of(page,10))
|
|
|
|
.build();
|
|
|
|
|
|
|
|
|
2022-02-09 11:33:09 +01:00
|
|
|
Pair<RestHighLevelClient, ElasticsearchRestTemplate> resource = connectionPool.getResource();
|
|
|
|
ElasticsearchRestTemplate client = resource.getValue();
|
|
|
|
|
|
|
|
long tt = client.count(finalQuery, Scholix.class, IndexCoordinates.of(elasticSearchProperties.getIndexName()));
|
|
|
|
|
|
|
|
SearchHits<Scholix> scholixRes = client.search(finalQuery, Scholix.class, IndexCoordinates.of(elasticSearchProperties.getIndexName()));
|
2022-02-07 16:36:26 +01:00
|
|
|
|
2022-02-09 11:33:09 +01:00
|
|
|
connectionPool.returnResource(resource);
|
2022-02-07 16:36:26 +01:00
|
|
|
|
|
|
|
return new ImmutablePair<>(tt,scholixRes.stream().map(SearchHit::getContent).collect(Collectors.toList()));
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|