dnet-hadoop/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java

66 lines
1.8 KiB
Java

package eu.dnetlib.dhp.bmuse.utils;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import hwu.elixir.utils.Helpers;
/**
 * Helper for downloading a sitemap and selecting elements out of it.
 */
public class UrlParser {

    private static final Logger logger = LoggerFactory.getLogger(UrlParser.class);

    /** File extension that marks a sitemap as gzip-compressed. */
    private static final String GZIP_EXTENSION = ".gz";

    /**
     * Downloads the sitemap found at {@code url} and returns every element matching
     * {@code sitemapURLKey}.
     *
     * <p>The download is best-effort: an {@link IOException} raised while fetching is logged and
     * the selection is then run against an empty document, so the result is an empty
     * {@link Elements}. If the content looks like a sitemap index (the serialised document
     * contains the text {@code sitemapindex}) a warning is logged, because sitemap index files
     * are not supported; the matching elements are still returned.
     *
     * @param url           address of the sitemap; a URL ending in {@code .gz} (any case) is
     *                      treated as gzip-compressed
     * @param sitemapURLKey selector query passed to {@link Document#select(String)}
     * @return the matching elements, never {@code null}; empty when nothing matched or the
     *         sitemap could not be retrieved
     * @throws IOException declared for backward compatibility with existing callers; fetch
     *                     failures are logged rather than propagated
     */
    public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
        // Empty placeholder document so that a failed download still yields an empty selection.
        Document doc = new Document(url);
        try {
            logger.info("parse sitemap list");
            doc = fetchSitemap(url);
        } catch (IOException e) {
            // Pass the exception as last argument so SLF4J records the stack trace as well.
            logger.error("Jsoup parsing exception: {}", e.getMessage(), e);
        }

        // NOTE(review): the decompression helper is opaque from here and may hand back null;
        // guard explicitly instead of relying on a NullPointerException.
        if (doc == null) {
            logger.error("no document could be obtained from sitemap {}", url);
            return new Elements();
        }

        Elements elements = doc.select(sitemapURLKey);

        // Crude content sniffing: a sitemap index mentions "sitemapindex" in its markup.
        if (doc.outerHtml().contains("sitemapindex")) {
            logger
                .warn(
                    "please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
        }
        return elements;
    }

    /**
     * Retrieves the sitemap at {@code url}, handing gzip-compressed payloads to
     * {@code Helpers.gzipFileDecompression}.
     */
    private static Document fetchSitemap(String url) throws IOException {
        // Only the trailing extension is inspected. regionMatches returns false (instead of
        // throwing) when the URL is shorter than the extension.
        boolean compressed = url
            .regionMatches(
                true, url.length() - GZIP_EXTENSION.length(), GZIP_EXTENSION, 0, GZIP_EXTENSION.length());

        if (compressed) {
            logger.info("compressed sitemap");
            // maxBodySize(0) lifts jsoup's default body-size cap, matching the uncompressed
            // branch; a truncated gzip stream could not be decompressed.
            byte[] bytes = Jsoup.connect(url).ignoreContentType(true).maxBodySize(0).execute().bodyAsBytes();
            return Helpers.gzipFileDecompression(bytes);
        }
        return Jsoup.connect(url).maxBodySize(0).get();
    }
}