dnet-applications/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapeState.java

package eu.dnetlib.bioschemas.api.scraper;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.crawl.StatusOfScrape;

/**
 * Shared, mutable state of a single scrape cycle: the queue of URLs still to
 * be scraped, the records already processed, and the N-Quads collected so far.
 * All methods that touch the two lists are synchronized, so a single instance
 * can be shared safely between several {@link ScrapeThread}s.
 */
public class ScrapeState {
    private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
    private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?
    private Map<String, Object> nquadsConcurrentHashMap = new ConcurrentHashMap<>();

    /**
     * Creates the state for one scrape cycle, seeding the queue of URLs to be
     * scraped.
     *
     * @param pagesToBeScraped The list of sites to be scraped
     * @see ScrapeThread
     * @see CrawlRecord
     */
    public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
        urlsToScrape.addAll(pagesToBeScraped);
    }

    /**
     * Any pages/URLs left to scrape?
     *
     * @return true if at least one URL is still queued for scraping; false otherwise
     * @see CrawlRecord
     */
    public synchronized boolean pagesLeftToScrape() {
        return !urlsToScrape.isEmpty();
    }

    /**
     * Removes and returns the next URL/CrawlRecord to be scraped.
     *
     * @return the first page/URL that needs to be scraped next, or null if the
     *         queue is empty
     * @see CrawlRecord
     */
    public synchronized CrawlRecord getURLToProcess() {
        if (urlsToScrape.isEmpty()) {
            return null;
        }
        return urlsToScrape.remove(0);
    }
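
    /*
     * Typical consumer loop (illustrative sketch only; the real loop lives in
     * ScrapeThread, which is not part of this file):
     *
     *   CrawlRecord record;
     *   while ((record = state.getURLToProcess()) != null) {
     *       // ... scrape record, then report the outcome back via
     *       // addSuccessfulScrapedURL / addFailedToScrapeURL ...
     *   }
     */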

    /**
     * Adds the given CrawlRecord to the list of CrawlRecords successfully
     * scraped and updates its status to SUCCESS.
     *
     * @param record The latest URL/page that has been successfully scraped
     * @see CrawlRecord
     */
    public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
        record.setStatus(StatusOfScrape.SUCCESS);
        urlsProcessed.add(record);
    }

    /**
     * Adds the given CrawlRecord to the list of CrawlRecords NOT successfully
     * scraped and updates its status: on the first failure the status becomes
     * FAILED; if it is already FAILED it is escalated to GIVEN_UP.
     *
     * A record left in the FAILED state will be retried in a future run.
     *
     * @param record The latest URL/page that could not be scraped
     * @see CrawlRecord
     */
    public synchronized void addFailedToScrapeURL(CrawlRecord record) {
        if (record.getStatus().equals(StatusOfScrape.FAILED)) {
            // second consecutive failure: stop retrying this URL
            record.setStatus(StatusOfScrape.GIVEN_UP);
        } else {
            record.setStatus(StatusOfScrape.FAILED);
        }
        urlsProcessed.add(record);
    }

    /**
     * Changes the status of the CrawlRecord to DOES_NOT_EXIST.
     * As Selenium does not expose HTTP status codes, it is questionable how
     * useful this is.
     *
     * @param record The latest URL/page that returned a 404
     * @see CrawlRecord
     */
    public synchronized void setStatusTo404(CrawlRecord record) {
        record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
        urlsProcessed.add(record);
    }

    /**
     * Changes the status of the CrawlRecord to HUMAN_INSPECTION.
     * This captures the idea that the URL may contain unexpected markup that
     * needs a human to review, and possibly an update to the scraper.
     *
     * @param record The latest URL/page that needs human inspection
     * @see CrawlRecord
     */
    public synchronized void setStatusToHumanInspection(CrawlRecord record) {
        record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
        urlsProcessed.add(record);
    }

    /**
     * Returns the number of URLs that are still to be scraped in this cycle.
     * This does not return the number of URLs left to scrape in the DBMS, just
     * in the current cycle.
     *
     * @return Number of URLs left to scrape in this cycle
     * @see CrawlRecord
     */
    public synchronized int getNumberPagesLeftToScrape() {
        return urlsToScrape.size();
    }

    /**
     * Gets the full list of URLs that have been processed in this cycle.
     * This does not include URLs processed in earlier cycles.
     *
     * @return List of CrawlRecords processed in this cycle (the live list, not
     *         a copy)
     * @see CrawlRecord
     */
    public synchronized List<CrawlRecord> getPagesProcessed() {
        return urlsProcessed;
    }

    /**
     * Gets the full list of URLs/CrawlRecords in the current cycle, whether
     * already scraped or not.
     *
     * @return List of all CrawlRecords in this cycle
     * @see CrawlRecord
     */
    public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
        List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
        urlsCombined.addAll(urlsProcessed);
        urlsCombined.addAll(urlsToScrape);
        return urlsCombined;
    }

    /**
     * Stores the N-Quads scraped for the given key, keeping the existing value
     * if the key is already present. Thread safety is provided by the
     * underlying ConcurrentHashMap, so the method itself is not synchronized.
     */
    public void addNquads(String key, String nquads) {
        nquadsConcurrentHashMap.putIfAbsent(key, nquads);
    }

    public Map<String, Object> getNquadsConcurrentHashMap() {
        return nquadsConcurrentHashMap;
    }
}
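
/*
 * Minimal end-to-end usage sketch (illustrative only; it assumes CrawlRecord
 * has a no-argument constructor and a setUrl(String) setter, neither of which
 * is shown in this file):
 *
 *   List<CrawlRecord> batch = new ArrayList<>();
 *   CrawlRecord record = new CrawlRecord();
 *   record.setUrl("https://example.org/some-page");
 *   batch.add(record);
 *
 *   ScrapeState state = new ScrapeState(batch);
 *   while (state.pagesLeftToScrape()) {
 *       CrawlRecord next = state.getURLToProcess();
 *       if (next == null) {
 *           break; // another thread emptied the queue first
 *       }
 *       boolean ok = scrape(next); // hypothetical scraping helper
 *       if (ok) {
 *           state.addSuccessfulScrapedURL(next);
 *       } else {
 *           state.addFailedToScrapeURL(next);
 *       }
 *   }
 */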