forked from D-Net/dnet-hadoop
66 lines
1.8 KiB
Java
66 lines
1.8 KiB
Java
|
|
package eu.dnetlib.dhp.bmuse.utils;
|
|
|
|
import java.io.IOException;
|
|
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.select.Elements;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import hwu.elixir.utils.Helpers;
|
|
|
|
public class UrlParser {
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
|
|
|
|
public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
|
|
|
|
Document doc = new Document(url);
|
|
Document urlSitemapListsNested;
|
|
Elements elements = new Elements();
|
|
Elements sitemaps = new Elements();
|
|
boolean sitemapindex = false;
|
|
boolean urlset = false;
|
|
|
|
try {
|
|
int urlLength = url.length();
|
|
logger.info("parse sitemap list");
|
|
String sitemapExt = url.substring(urlLength - 3, urlLength);
|
|
if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
|
|
logger.info("compressed sitemap");
|
|
byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
|
|
doc = Helpers.gzipFileDecompression(bytes);
|
|
} else {
|
|
doc = Jsoup.connect(url).maxBodySize(0).get();
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
logger.error("Jsoup parsing exception: " + e.getMessage());
|
|
}
|
|
|
|
try {
|
|
|
|
elements = doc.select(sitemapURLKey);
|
|
|
|
// check the html if it is a sitemapindex or a urlset
|
|
sitemapindex = doc.outerHtml().contains("sitemapindex");
|
|
urlset = doc.outerHtml().contains("urlset");
|
|
} catch (NullPointerException e) {
|
|
logger.error(e.getMessage());
|
|
}
|
|
|
|
if (sitemapindex) {
|
|
// if sitemapindex get the loc of all the sitemaps
|
|
// added warning for sitemap index files
|
|
logger
|
|
.warn(
|
|
"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
|
|
sitemaps = doc.select(sitemapURLKey);
|
|
}
|
|
|
|
return elements;
|
|
}
|
|
}
|