2023-05-25 12:17:46 +02:00
|
|
|
package eu.dnetlib.apps.oai2ftp.service;
|
|
|
|
|
2023-05-25 15:25:09 +02:00
|
|
|
import java.time.Duration;
|
|
|
|
import java.time.LocalDateTime;
|
2023-05-25 12:17:46 +02:00
|
|
|
import java.util.LinkedHashMap;
|
2023-05-26 11:22:15 +02:00
|
|
|
import java.util.List;
|
2023-05-25 12:17:46 +02:00
|
|
|
import java.util.Map;
|
2023-05-25 15:25:09 +02:00
|
|
|
import java.util.Set;
|
2023-05-25 12:17:46 +02:00
|
|
|
import java.util.concurrent.ExecutorService;
|
|
|
|
import java.util.concurrent.Executors;
|
2023-05-25 15:25:09 +02:00
|
|
|
import java.util.concurrent.TimeUnit;
|
|
|
|
import java.util.stream.Collectors;
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-25 15:25:09 +02:00
|
|
|
import org.apache.commons.lang3.StringUtils;
|
2023-05-26 12:03:27 +02:00
|
|
|
import org.apache.commons.lang3.exception.ExceptionUtils;
|
2023-05-25 12:17:46 +02:00
|
|
|
import org.apache.commons.logging.Log;
|
|
|
|
import org.apache.commons.logging.LogFactory;
|
2023-05-26 11:22:15 +02:00
|
|
|
import org.dom4j.Document;
|
|
|
|
import org.dom4j.DocumentHelper;
|
|
|
|
import org.dom4j.Node;
|
2023-05-25 12:17:46 +02:00
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
2023-05-25 15:25:09 +02:00
|
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
|
|
import org.springframework.scheduling.annotation.Scheduled;
|
2023-05-25 12:17:46 +02:00
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
import eu.dnetlib.apps.oai2ftp.model.CollectionCall;
|
|
|
|
import eu.dnetlib.apps.oai2ftp.model.CollectionInfo;
|
2023-05-25 15:25:09 +02:00
|
|
|
import eu.dnetlib.apps.oai2ftp.model.ExecutionStatus;
|
2023-05-25 12:17:46 +02:00
|
|
|
import eu.dnetlib.apps.oai2ftp.repository.CollectionLogEntryRepository;
|
2023-05-26 11:22:15 +02:00
|
|
|
import eu.dnetlib.apps.oai2ftp.utils.HttpFetcher;
|
|
|
|
import eu.dnetlib.apps.oai2ftp.utils.SimpleUtils;
|
2023-05-26 13:52:31 +02:00
|
|
|
import eu.dnetlib.apps.oai2ftp.utils.StorageClient;
|
|
|
|
import eu.dnetlib.apps.oai2ftp.utils.StorageClientFactory;
|
2023-05-25 12:17:46 +02:00
|
|
|
|
|
|
|
@Service
|
|
|
|
public class Oai2FtpService {
|
|
|
|
|
|
|
|
private static final Log log = LogFactory.getLog(Oai2FtpService.class);
|
|
|
|
|
|
|
|
private final ExecutorService jobExecutor = Executors.newFixedThreadPool(100);
|
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
private final Map<String, CollectionInfo> infoMap = new LinkedHashMap<>();
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-25 14:44:19 +02:00
|
|
|
@Autowired
|
2023-05-26 13:52:31 +02:00
|
|
|
private StorageClientFactory storageClientFactory;
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-25 15:25:09 +02:00
|
|
|
@Value("${oai2ftp.conf.execution.expirationTime}")
|
2023-05-26 11:22:15 +02:00
|
|
|
private long fullInfoExpirationTime; // in hours
|
2023-05-25 15:25:09 +02:00
|
|
|
|
2023-05-25 12:17:46 +02:00
|
|
|
@Autowired
|
|
|
|
private CollectionLogEntryRepository collectionLogEntryRepository;
|
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
public CollectionInfo startCollection(final String baseUrl,
|
|
|
|
final String format,
|
|
|
|
final String setSpec,
|
|
|
|
final LocalDateTime from,
|
|
|
|
final LocalDateTime until) {
|
|
|
|
final String jobId = SimpleUtils.generateNewJobId();
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-26 13:52:31 +02:00
|
|
|
final StorageClient sc = storageClientFactory.newClientForJob(jobId);
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
final CollectionInfo info = new CollectionInfo();
|
|
|
|
info.setId(jobId);
|
|
|
|
|
|
|
|
info.setOaiBaseUrl(baseUrl);
|
|
|
|
info.setOaiFormat(format);
|
|
|
|
info.setOaiSet(setSpec);
|
|
|
|
info.setOaiFrom(from);
|
|
|
|
info.setOaiUntil(until);
|
|
|
|
|
|
|
|
info.setStart(LocalDateTime.now());
|
|
|
|
info.setEnd(null);
|
|
|
|
|
|
|
|
info.setExecutionStatus(ExecutionStatus.READY);
|
|
|
|
info.setTotal(0);
|
|
|
|
info.setMessage("");
|
|
|
|
|
|
|
|
infoMap.put(jobId, info);
|
|
|
|
|
|
|
|
jobExecutor.execute(() -> {
|
|
|
|
try {
|
|
|
|
info.setExecutionStatus(ExecutionStatus.RUNNING);
|
2023-05-26 13:52:31 +02:00
|
|
|
oaiCollect(baseUrl, format, setSpec, from, until, sc, info);
|
2023-05-26 11:22:15 +02:00
|
|
|
info.setExecutionStatus(ExecutionStatus.COMPLETED);
|
|
|
|
} catch (final Throwable e) {
|
|
|
|
info.setExecutionStatus(ExecutionStatus.FAILED);
|
2023-05-26 12:03:27 +02:00
|
|
|
info.setMessage(e.getMessage() + ": " + ExceptionUtils.getStackTrace(e));
|
2023-05-26 11:22:15 +02:00
|
|
|
} finally {
|
2023-05-26 12:03:27 +02:00
|
|
|
info.setEnd(LocalDateTime.now());
|
2023-05-26 13:52:31 +02:00
|
|
|
sc.disconnect();
|
2023-05-26 11:22:15 +02:00
|
|
|
collectionLogEntryRepository.save(SimpleUtils.infoToLog(info));
|
|
|
|
}
|
|
|
|
});
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
return info;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void oaiCollect(final String baseUrl,
|
|
|
|
final String format,
|
|
|
|
final String setSpec,
|
|
|
|
final LocalDateTime from,
|
|
|
|
final LocalDateTime until,
|
2023-05-26 13:52:31 +02:00
|
|
|
final StorageClient sc,
|
2023-05-26 11:22:15 +02:00
|
|
|
final CollectionInfo info)
|
|
|
|
throws Exception {
|
|
|
|
|
|
|
|
String url = SimpleUtils.oaiFirstUrl(baseUrl, format, setSpec, from, until);
|
|
|
|
|
|
|
|
while (StringUtils.isNotBlank(url)) {
|
|
|
|
final CollectionCall call = new CollectionCall();
|
|
|
|
call.setUrl(url);
|
|
|
|
info.getCalls().add(call);
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
final String xml = HttpFetcher.download(call);
|
|
|
|
final Document doc = DocumentHelper.parseText(xml);
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
final List<Node> records = doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']");
|
|
|
|
call.setNumberOfRecords(records.size());
|
2023-05-25 12:17:46 +02:00
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
for (final Node n : records) {
|
2023-05-26 13:52:31 +02:00
|
|
|
final String id = n.valueOf(".//*[local-name()='header']/*[local-name()='identifier']");
|
|
|
|
|
|
|
|
sc.saveFile(SimpleUtils.oaiIdToFilename(id), n.asXML());
|
2023-05-26 11:22:15 +02:00
|
|
|
info.setTotal(info.getTotal() + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
final String rtoken = doc.valueOf("//*[local-name()='resumptionToken']").trim();
|
|
|
|
|
|
|
|
url = SimpleUtils.oaiNextUrl(baseUrl, rtoken);
|
|
|
|
}
|
2023-05-25 12:17:46 +02:00
|
|
|
}
|
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
public CollectionInfo getStatus(final String jobId) {
|
|
|
|
final CollectionInfo info = infoMap.get(jobId);
|
|
|
|
if (info != null) {
|
|
|
|
return info;
|
2023-05-25 12:17:46 +02:00
|
|
|
} else {
|
|
|
|
return collectionLogEntryRepository.findById(jobId)
|
2023-05-26 11:22:15 +02:00
|
|
|
.map(SimpleUtils::logToInfo)
|
2023-05-26 12:03:27 +02:00
|
|
|
.orElseThrow(() -> new RuntimeException("Invalid id: " + jobId));
|
2023-05-25 12:17:46 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-25 15:25:09 +02:00
|
|
|
@Scheduled(fixedRate = 30, timeUnit = TimeUnit.MINUTES)
|
|
|
|
public void cronCleanJobs() throws Exception {
|
2023-05-26 11:22:15 +02:00
|
|
|
final Set<String> toDelete = infoMap.entrySet()
|
2023-05-25 15:25:09 +02:00
|
|
|
.stream()
|
|
|
|
.filter(e -> {
|
2023-05-26 11:22:15 +02:00
|
|
|
final ExecutionStatus status = e.getValue().getExecutionStatus();
|
2023-05-25 15:25:09 +02:00
|
|
|
return status == ExecutionStatus.COMPLETED || status == ExecutionStatus.FAILED;
|
|
|
|
})
|
|
|
|
.filter(e -> {
|
2023-05-26 11:22:15 +02:00
|
|
|
final LocalDateTime end = e.getValue().getEnd();
|
2023-05-25 15:25:09 +02:00
|
|
|
final long hours = Duration.between(end, LocalDateTime.now()).toHours();
|
2023-05-26 11:22:15 +02:00
|
|
|
return Math.abs(hours) > fullInfoExpirationTime;
|
2023-05-25 15:25:09 +02:00
|
|
|
})
|
|
|
|
.map(e -> e.getKey())
|
|
|
|
.collect(Collectors.toSet());
|
|
|
|
|
|
|
|
log.info("Cleaning expired jobs: " + StringUtils.join(toDelete, ", "));
|
|
|
|
|
2023-05-26 11:22:15 +02:00
|
|
|
toDelete.forEach(infoMap::remove);
|
2023-05-25 15:25:09 +02:00
|
|
|
}
|
|
|
|
|
2023-05-25 12:17:46 +02:00
|
|
|
}
|