dnet-applications/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapeState.java

package eu.dnetlib.bioschemas.api.scraper;

import eu.dnetlib.bioschemas.api.crawl.StatusOfScrape;
import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**

 */
public class ScrapeState {

	private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
	private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>());  // should this be a set?
	private Map<String, Object> nquadsConcurrentHashMap = new ConcurrentHashMap<>();

	/**
	 * 
	 * @param pagesToBeScraped The list of sites to be scraped
	 * @see ScrapeThread
	 * @see CrawlRecord
	 */
	public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
		urlsToScrape.addAll(pagesToBeScraped);
	}

	/**
	 * Any pages/URLs left to scrape?
	 * @return True for yes & false for no
	 * @see CrawlRecord
	 */
	public synchronized boolean pagesLeftToScrape() {
		return !urlsToScrape.isEmpty();
	}

	/**
	 * Returns the next URL/CrawlRecord to be scraped
	 * 
	 * @return First page/URL that needs to be scraped next
	 * @see CrawlRecord
	 */
	public synchronized CrawlRecord getURLToProcess() {
		if (urlsToScrape.isEmpty())
			return null;

		return urlsToScrape.remove(0);
	}

	/**
	 * Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.
	 * Updates the status of the CrawlRecord to SUCCESS.
	 * 
	 * @param url The latest URL/page that has been successfully scraped
	 * @see CrawlRecord
	 */
	public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
		record.setStatus(StatusOfScrape.SUCCESS);
		urlsProcessed.add(record);
	}

	/**
	 * Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.
	 * Updates the status of the CrawlRecord; if first failure the status is FAILED.
	 * If status is already FAILED it is changed to GIVEN_UP.
	 * 
	 * If the status is FAILED, another try will be made in a future run.
	 *  
	 * 
	 * @param url The latest URL/page that has been unsuccessfully scraped
	 * @see CrawlRecord
	 */
	public synchronized void addFailedToScrapeURL(CrawlRecord record) {
		if (record.getStatus().equals(StatusOfScrape.FAILED)) {
			record.setStatus(StatusOfScrape.GIVEN_UP);
		} else {
			record.setStatus(StatusOfScrape.FAILED);
		}
		urlsProcessed.add(record);
	}

	/**
	 * Changes the status of the CrawlRecord to DOES_NOT_EXIST.
	 * As Selenium does not return the HTTP codes, it is questionable 
	 * how useful this is.
	 * 
	 * 
	 * @param url The latest URL/page that has been 404'd
	 * @see CrawlRecord
	 */
	public synchronized void setStatusTo404(CrawlRecord record) {
		record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
		urlsProcessed.add(record);
	}	
	
	
	/**
	 * 
	 * Changes the status of the CrawlRecord to HUMAN_INSPECTION.
	 * This captures the idea that the URLs may contain unexpected markup that needs a human to 
	 * review and possibly update the scraper. 
	 * 
	 * @param url The latest URL/page that needs human inspection
	 * @see CrawlRecord
	 */
	public synchronized void setStatusToHumanInspection(CrawlRecord record) {
		record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
		urlsProcessed.add(record);
	}	
	
	
	/**
	 * Returns the number of URLs that are still to be scraped in this cycle. 
	 * This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.
	 * 
	 * @return Number of URLs left to scrape in this cycle
	 * @see CrawlRecord
	 */
	public synchronized int getNumberPagesLeftToScrape() {
		return urlsToScrape.size();
	}

	/**
	 * Gets the full list of URLs that have been processed in this cycle.
	 * This does not return the number of URLs that have been scraped in total across all cycles.
	 * 
	 * @return
	 * @see CrawlRecord
	 */
	public synchronized List<CrawlRecord> getPagesProcessed() {
		return urlsProcessed;
	}
	
	/**
	 * Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.
	 * 
	 * @return List of all CrawlRecords in this cycle.
	 * @see CrawlRecord
	 */
	public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
		List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
		urlsCombined.addAll(urlsProcessed);
		urlsCombined.addAll(urlsToScrape);
		return urlsCombined;
	}

	public void addNquads(String key, String nquads) {
		nquadsConcurrentHashMap.putIfAbsent(key, nquads);
	}

	public Map<String, Object> getNquadsConcurrentHashMap() {
		return nquadsConcurrentHashMap;
	}
}
refactoring; webapp context set to bioschemas 2022-06-07 13:38:14 +02:00			`package eu.dnetlib.bioschemas.api.scraper;`
added app for bioschemas sources harvesting 2022-06-06 09:37:29 +02:00
refactoring; webapp context set to bioschemas 2022-06-07 13:38:14 +02:00			`import eu.dnetlib.bioschemas.api.crawl.StatusOfScrape;`
			`import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;`
added app for bioschemas sources harvesting 2022-06-06 09:37:29 +02:00
			`import java.util.ArrayList;`
			`import java.util.Collections;`
			`import java.util.List;`
			`import java.util.Map;`
			`import java.util.concurrent.ConcurrentHashMap;`

			`/**`

			`*/`
			`public class ScrapeState {`

			`private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());`
			`private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?`
			`private Map<String, Object> nquadsConcurrentHashMap = new ConcurrentHashMap<>();`

			`/**`
			`*`
			`* @param pagesToBeScraped The list of sites to be scraped`
			`* @see ScrapeThread`
			`* @see CrawlRecord`
			`*/`
			`public ScrapeState(List<CrawlRecord> pagesToBeScraped) {`
			`urlsToScrape.addAll(pagesToBeScraped);`
			`}`

			`/**`
			`* Any pages/URLs left to scrape?`
			`* @return True for yes & false for no`
			`* @see CrawlRecord`
			`*/`
			`public synchronized boolean pagesLeftToScrape() {`
			`return !urlsToScrape.isEmpty();`
			`}`

			`/**`
			`* Returns the next URL/CrawlRecord to be scraped`
			`*`
			`* @return First page/URL that needs to be scraped next`
			`* @see CrawlRecord`
			`*/`
			`public synchronized CrawlRecord getURLToProcess() {`
			`if (urlsToScrape.isEmpty())`
			`return null;`

			`return urlsToScrape.remove(0);`
			`}`

			`/**`
			`* Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.`
			`* Updates the status of the CrawlRecord to SUCCESS.`
			`*`
			`* @param url The latest URL/page that has been successfully scraped`
			`* @see CrawlRecord`
			`*/`
			`public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {`
			`record.setStatus(StatusOfScrape.SUCCESS);`
			`urlsProcessed.add(record);`
			`}`

			`/**`
			`* Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.`
			`* Updates the status of the CrawlRecord; if first failure the status is FAILED.`
			`* If status is already FAILED it is changed to GIVEN_UP.`
			`*`
			`* If the status is FAILED, another try will be made in a future run.`
			`*`
			`*`
			`* @param url The latest URL/page that has been unsuccessfully scraped`
			`* @see CrawlRecord`
			`*/`
			`public synchronized void addFailedToScrapeURL(CrawlRecord record) {`
			`if (record.getStatus().equals(StatusOfScrape.FAILED)) {`
			`record.setStatus(StatusOfScrape.GIVEN_UP);`
			`} else {`
			`record.setStatus(StatusOfScrape.FAILED);`
			`}`
			`urlsProcessed.add(record);`
			`}`

			`/**`
			`* Changes the status of the CrawlRecord to DOES_NOT_EXIST.`
			`* As Selenium does not return the HTTP codes, it is questionable`
			`* how useful this is.`
			`*`
			`*`
			`* @param url The latest URL/page that has been 404'd`
			`* @see CrawlRecord`
			`*/`
			`public synchronized void setStatusTo404(CrawlRecord record) {`
			`record.setStatus(StatusOfScrape.DOES_NOT_EXIST);`
			`urlsProcessed.add(record);`
			`}`


			`/**`
			`*`
			`* Changes the status of the CrawlRecord to HUMAN_INSPECTION.`
			`* This captures the idea that the URLs may contain unexpected markup that needs a human to`
			`* review and possibly update the scraper.`
			`*`
			`* @param url The latest URL/page that needs human inspection`
			`* @see CrawlRecord`
			`*/`
			`public synchronized void setStatusToHumanInspection(CrawlRecord record) {`
			`record.setStatus(StatusOfScrape.HUMAN_INSPECTION);`
			`urlsProcessed.add(record);`
			`}`


			`/**`
			`* Returns the number of URLs that are still to be scraped in this cycle.`
			`* This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.`
			`*`
			`* @return Number of URLs left to scrape in this cycle`
			`* @see CrawlRecord`
			`*/`
			`public synchronized int getNumberPagesLeftToScrape() {`
			`return urlsToScrape.size();`
			`}`

			`/**`
			`* Gets the full list of URLs that have been processed in this cycle.`
			`* This does not return the number of URLs that have been scraped in total across all cycles.`
			`*`
			`* @return`
			`* @see CrawlRecord`
			`*/`
			`public synchronized List<CrawlRecord> getPagesProcessed() {`
			`return urlsProcessed;`
			`}`

			`/**`
			`* Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.`
			`*`
			`* @return List of all CrawlRecords in this cycle.`
			`* @see CrawlRecord`
			`*/`
			`public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {`
			`List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());`
			`urlsCombined.addAll(urlsProcessed);`
			`urlsCombined.addAll(urlsToScrape);`
			`return urlsCombined;`
			`}`

			`public void addNquads(String key, String nquads) {`
			`nquadsConcurrentHashMap.putIfAbsent(key, nquads);`
			`}`

			`public Map<String, Object> getNquadsConcurrentHashMap() {`
			`return nquadsConcurrentHashMap;`
			`}`
			`}`