simpleOaiCollectorService/src/main/java/eu/dnetlib/apps/Oai2ftp/service/Oai2FtpService.java

166 lines
5.1 KiB
Java
Raw Normal View History

package eu.dnetlib.apps.oai2ftp.service;
2023-05-25 15:25:09 +02:00
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;

import eu.dnetlib.apps.oai2ftp.model.CollectionCall;
import eu.dnetlib.apps.oai2ftp.model.CollectionInfo;
import eu.dnetlib.apps.oai2ftp.model.ExecutionStatus;
import eu.dnetlib.apps.oai2ftp.repository.CollectionLogEntryRepository;
import eu.dnetlib.apps.oai2ftp.utils.HttpFetcher;
import eu.dnetlib.apps.oai2ftp.utils.SimpleUtils;
import eu.dnetlib.apps.oai2ftp.utils.StorageClient;
import eu.dnetlib.apps.oai2ftp.utils.StorageClientFactory;

@Service
public class Oai2FtpService {
private static final Log log = LogFactory.getLog(Oai2FtpService.class);
private final ExecutorService jobExecutor = Executors.newFixedThreadPool(100);
2023-05-26 11:22:15 +02:00
private final Map<String, CollectionInfo> infoMap = new LinkedHashMap<>();
2023-05-25 14:44:19 +02:00
@Autowired
2023-05-26 13:52:31 +02:00
private StorageClientFactory storageClientFactory;
2023-05-25 15:25:09 +02:00
@Value("${oai2ftp.conf.execution.expirationTime}")
2023-05-26 11:22:15 +02:00
private long fullInfoExpirationTime; // in hours
2023-05-25 15:25:09 +02:00
@Autowired
private CollectionLogEntryRepository collectionLogEntryRepository;
2023-05-26 11:22:15 +02:00
public CollectionInfo startCollection(final String baseUrl,
final String format,
final String setSpec,
final LocalDateTime from,
final LocalDateTime until) {
final String jobId = SimpleUtils.generateNewJobId();
2023-05-26 13:52:31 +02:00
final StorageClient sc = storageClientFactory.newClientForJob(jobId);
2023-05-26 11:22:15 +02:00
final CollectionInfo info = new CollectionInfo();
info.setId(jobId);
info.setOaiBaseUrl(baseUrl);
info.setOaiFormat(format);
info.setOaiSet(setSpec);
info.setOaiFrom(from);
info.setOaiUntil(until);
info.setStart(LocalDateTime.now());
info.setEnd(null);
info.setExecutionStatus(ExecutionStatus.READY);
info.setTotal(0);
info.setMessage("");
infoMap.put(jobId, info);
jobExecutor.execute(() -> {
try {
info.setExecutionStatus(ExecutionStatus.RUNNING);
2023-05-26 13:52:31 +02:00
oaiCollect(baseUrl, format, setSpec, from, until, sc, info);
2023-05-26 11:22:15 +02:00
info.setExecutionStatus(ExecutionStatus.COMPLETED);
} catch (final Throwable e) {
info.setExecutionStatus(ExecutionStatus.FAILED);
2023-05-26 12:03:27 +02:00
info.setMessage(e.getMessage() + ": " + ExceptionUtils.getStackTrace(e));
2023-05-26 11:22:15 +02:00
} finally {
2023-05-26 12:03:27 +02:00
info.setEnd(LocalDateTime.now());
2023-05-26 13:52:31 +02:00
sc.disconnect();
2023-05-26 11:22:15 +02:00
collectionLogEntryRepository.save(SimpleUtils.infoToLog(info));
}
});
2023-05-26 11:22:15 +02:00
return info;
}
public void oaiCollect(final String baseUrl,
final String format,
final String setSpec,
final LocalDateTime from,
final LocalDateTime until,
2023-05-26 13:52:31 +02:00
final StorageClient sc,
2023-05-26 11:22:15 +02:00
final CollectionInfo info)
throws Exception {
String url = SimpleUtils.oaiFirstUrl(baseUrl, format, setSpec, from, until);
while (StringUtils.isNotBlank(url)) {
final CollectionCall call = new CollectionCall();
call.setUrl(url);
info.getCalls().add(call);
2023-05-26 11:22:15 +02:00
final String xml = HttpFetcher.download(call);
final Document doc = DocumentHelper.parseText(xml);
2023-05-26 11:22:15 +02:00
final List<Node> records = doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']");
call.setNumberOfRecords(records.size());
2023-05-26 11:22:15 +02:00
for (final Node n : records) {
2023-05-26 13:52:31 +02:00
final String id = n.valueOf(".//*[local-name()='header']/*[local-name()='identifier']");
sc.saveFile(SimpleUtils.oaiIdToFilename(id), n.asXML());
2023-05-26 11:22:15 +02:00
info.setTotal(info.getTotal() + 1);
}
final String rtoken = doc.valueOf("//*[local-name()='resumptionToken']").trim();
url = SimpleUtils.oaiNextUrl(baseUrl, rtoken);
}
}
2023-05-26 11:22:15 +02:00
public CollectionInfo getStatus(final String jobId) {
final CollectionInfo info = infoMap.get(jobId);
if (info != null) {
return info;
} else {
return collectionLogEntryRepository.findById(jobId)
2023-05-26 11:22:15 +02:00
.map(SimpleUtils::logToInfo)
2023-05-26 12:03:27 +02:00
.orElseThrow(() -> new RuntimeException("Invalid id: " + jobId));
}
}
2023-05-25 15:25:09 +02:00
@Scheduled(fixedRate = 30, timeUnit = TimeUnit.MINUTES)
public void cronCleanJobs() throws Exception {
2023-05-26 11:22:15 +02:00
final Set<String> toDelete = infoMap.entrySet()
2023-05-25 15:25:09 +02:00
.stream()
.filter(e -> {
2023-05-26 11:22:15 +02:00
final ExecutionStatus status = e.getValue().getExecutionStatus();
2023-05-25 15:25:09 +02:00
return status == ExecutionStatus.COMPLETED || status == ExecutionStatus.FAILED;
})
.filter(e -> {
2023-05-26 11:22:15 +02:00
final LocalDateTime end = e.getValue().getEnd();
2023-05-25 15:25:09 +02:00
final long hours = Duration.between(end, LocalDateTime.now()).toHours();
2023-05-26 11:22:15 +02:00
return Math.abs(hours) > fullInfoExpirationTime;
2023-05-25 15:25:09 +02:00
})
.map(e -> e.getKey())
.collect(Collectors.toSet());
log.info("Cleaning expired jobs: " + StringUtils.join(toDelete, ", "));
2023-05-26 11:22:15 +02:00
toDelete.forEach(infoMap::remove);
2023-05-25 15:25:09 +02:00
}
}