package eu.dnetlib.dhp.bmuse.utils; import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import hwu.elixir.utils.Helpers; public class UrlParser { private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName()); public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException { Document doc = new Document(url); Document urlSitemapListsNested; Elements elements = new Elements(); Elements sitemaps = new Elements(); boolean sitemapindex = false; boolean urlset = false; try { int urlLength = url.length(); logger.info("parse sitemap list"); String sitemapExt = url.substring(urlLength - 3, urlLength); if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending logger.info("compressed sitemap"); byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes(); doc = Helpers.gzipFileDecompression(bytes); } else { doc = Jsoup.connect(url).maxBodySize(0).get(); } } catch (IOException e) { logger.error("Jsoup parsing exception: " + e.getMessage()); } try { elements = doc.select(sitemapURLKey); // check the html if it is a sitemapindex or a urlset sitemapindex = doc.outerHtml().contains("sitemapindex"); urlset = doc.outerHtml().contains("urlset"); } catch (NullPointerException e) { logger.error(e.getMessage()); } if (sitemapindex) { // if sitemapindex get the loc of all the sitemaps // added warning for sitemap index files logger .warn( "please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead"); sitemaps = doc.select(sitemapURLKey); } return elements; } }