2022-10-18 14:00:26 +02:00
package eu.openaire.urls_controller.controllers ;
import eu.openaire.urls_controller.configuration.ImpalaConnector ;
2023-02-09 18:25:48 +01:00
import eu.openaire.urls_controller.services.StatsService ;
2022-10-18 14:00:26 +02:00
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import org.springframework.beans.factory.annotation.Autowired ;
import org.springframework.http.ResponseEntity ;
import org.springframework.web.bind.annotation.GetMapping ;
import org.springframework.web.bind.annotation.RequestMapping ;
2023-04-24 08:54:35 +02:00
import org.springframework.web.bind.annotation.RequestParam ;
2022-10-18 14:00:26 +02:00
import org.springframework.web.bind.annotation.RestController ;
/ * *
2023-02-02 16:58:47 +01:00
* This controller returns statistics for the database .
2022-10-18 14:00:26 +02:00
* /
@RestController
@RequestMapping ( " /stats " )
public class StatsController {
private static final Logger logger = LoggerFactory . getLogger ( StatsController . class ) ;
@Autowired
2023-02-09 18:25:48 +01:00
private StatsService statsService ;
2022-10-18 14:00:26 +02:00
2023-02-02 16:58:47 +01:00
/ * *
* This endpoint returns the total number of payloads existing in the database , independently of the way they were aggregated .
* This includes the payloads created by other pieces of software , before the PDF - Aggregation - Service was created .
* * /
@GetMapping ( " getNumberOfAllPayloads " )
public ResponseEntity < ? > getNumberOfAllPayloads ( ) {
logger . info ( " Received a \" getNumberOfAllPayloads \" request. " ) ;
2022-12-15 13:04:22 +01:00
final String getPayloadsNumberQuery = " select count(id) from " + ImpalaConnector . databaseName + " .payload " ;
2023-05-04 14:48:49 +02:00
return statsService . getNumberOfPayloads ( getPayloadsNumberQuery , " payloads " ) ;
2023-02-02 16:58:47 +01:00
}
/ * *
* This endpoint returns the number of payloads aggregated by the PDF - Aggregated - Service itself .
* It excludes the payloads aggregated by other methods , by applying a Date - filter for the records created in 2021 or later .
* * /
@GetMapping ( " getNumberOfPayloadsAggregatedByService " )
public ResponseEntity < ? > getNumberOfPayloadsAggregatedByService ( ) {
logger . info ( " Received a \" getNumberOfPayloadsAggregatedByService \" request. " ) ;
String getPayloadsAggregatedQuery = " select count(id) from " + ImpalaConnector . databaseName + " .payload where `date` >= cast(cast('2021-01-01' as timestamp) as bigint) " ;
2023-05-04 14:48:49 +02:00
return statsService . getNumberOfPayloads ( getPayloadsAggregatedQuery , " payloads retrieved by the PDF Aggregation Service " ) ;
2022-10-18 14:00:26 +02:00
}
2023-04-24 08:54:35 +02:00
/ * *
* This endpoint returns the number of payloads related to the given datasourceID .
* * /
@GetMapping ( " getNumberOfPayloadsForDatasource " )
public ResponseEntity < ? > getNumberOfPayloadsForDatasource ( @RequestParam String datasourceId ) {
logger . info ( " Received a \" getNumberOfPayloadsForDatasource \" request. " ) ;
final String getPayloadsNumberForDatasourceQuery =
" select count(p.id) from " + ImpalaConnector . databaseName + " .payload p \ n " +
" join " + ImpalaConnector . databaseName + " .publication pu on pu.id=p.id and pu.datasourceid= \" " + datasourceId + " \" " ;
2023-05-15 11:44:16 +02:00
if ( logger . isTraceEnabled ( ) )
logger . trace ( " getPayloadsNumberForDatasourceQuery: \ n " + getPayloadsNumberForDatasourceQuery ) ;
2023-05-04 14:48:49 +02:00
return statsService . getNumberOfPayloads ( getPayloadsNumberForDatasourceQuery , " payloads related to datasourceId \" " + datasourceId + " \" " ) ;
}
// TODO - Add an endpoint to get the publication year as a param and return the number of payloads for the publications of that year.
// select count(p.id) from payload p
2023-06-15 22:19:36 +02:00
// join publication pu on pu.id=p.id and pu.year=<GIVEN_YEAR>
2023-05-04 14:48:49 +02:00
2023-04-24 08:54:35 +02:00
2023-05-04 14:48:49 +02:00
// TODO - Add an endpoint to return the info of all datasources in the database with the count of their payloads (including 0).
// Maybe have a param "numTopDatasources" which will work a a "limit" in the following query.
// In case the "numTopDatasources" param is not given or is less or equal to 0, then no limit will be added to the query.
/ *
select d . id , d . name , d . type , d . allow_harvest , count ( p . id ) as payload_count from datasource d
join publication pu on pu . datasourceid = d . id
2023-06-15 22:19:36 +02:00
left join payload p on p . id = pu . id - - We want the datasources with 0 payloads too , so we use " left join " .
2023-05-04 14:48:49 +02:00
group by d . id , d . name , d . type , d . allow_harvest
order by payload_count desc
* /
/ * *
* This endpoint returns the total number of distinct full - text files existing in the database .
* * /
@GetMapping ( " getNumberOfAllDistinctFullTexts " )
public ResponseEntity < ? > getNumberOfAllDistinctFullTexts ( ) {
logger . info ( " Received a \" getNumberOfAllDistinctFullTexts \" request. " ) ;
final String getPayloadsNumberQuery = " select count(distinct `hash`) from " + ImpalaConnector . databaseName + " .payload " ;
return statsService . getNumberOfPayloads ( getPayloadsNumberQuery , " distinct full-text files " ) ;
2023-04-24 08:54:35 +02:00
}
2023-02-02 16:58:47 +01:00
/ * *
* This endpoint returns the number of records inspected by the PDF - Aggregation - Service .
* * /
2022-10-18 14:00:26 +02:00
@GetMapping ( " getNumberOfRecordsInspected " )
public ResponseEntity < ? > getNumberOfRecordsInspected ( )
{
2023-02-09 18:25:48 +01:00
return statsService . getNumberOfRecordsInspected ( ) ;
2023-02-02 16:58:47 +01:00
}
2022-10-18 14:00:26 +02:00
}