forked from D-Net/dnet-hadoop
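Changed log level from info to debug in BMUSEScraper.scrapeUrl; removed commented-out code and the `total` variable from ScrapingJob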
This commit is contained in:
parent
1ffc8d4945
commit
4797cc460b
|
@ -25,7 +25,6 @@ import eu.dnetlib.dhp.bmuse.utils.UrlParser;
|
||||||
public class ScrapingJob {
|
public class ScrapingJob {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);
|
static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);
|
||||||
// private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@ -50,9 +49,6 @@ public class ScrapingJob {
|
||||||
}
|
}
|
||||||
final boolean scrapingType = dynamicValue.booleanValue();
|
final boolean scrapingType = dynamicValue.booleanValue();
|
||||||
|
|
||||||
// AtomicLong scraped = new AtomicLong(0l);
|
|
||||||
// AtomicLong errors = new AtomicLong(0l);
|
|
||||||
|
|
||||||
logger
|
logger
|
||||||
.info(
|
.info(
|
||||||
"*************************** STARTING_SCRAPE");
|
"*************************** STARTING_SCRAPE");
|
||||||
|
@ -60,7 +56,6 @@ public class ScrapingJob {
|
||||||
BMUSEScraper scraper = new BMUSEScraper();
|
BMUSEScraper scraper = new BMUSEScraper();
|
||||||
String url = sitemapUrl.toLowerCase();
|
String url = sitemapUrl.toLowerCase();
|
||||||
Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
|
Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
|
||||||
long total = urls.size();
|
|
||||||
|
|
||||||
Path output = new Path(
|
Path output = new Path(
|
||||||
nameNode
|
nameNode
|
||||||
|
@ -91,10 +86,8 @@ public class ScrapingJob {
|
||||||
nquads = scraper.scrapeUrl(site, scrapingType);
|
nquads = scraper.scrapeUrl(site, scrapingType);
|
||||||
final Text value = new Text(nquads);
|
final Text value = new Text(nquads);
|
||||||
writer.append(key, value);
|
writer.append(key, value);
|
||||||
// scraped.getAndIncrement();
|
|
||||||
} catch (Throwable t) {
|
} catch (Throwable t) {
|
||||||
logger.error(u.text() + " " + t.getMessage());
|
logger.error(u.text() + " " + t.getMessage());
|
||||||
// errors.getAndIncrement();
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,7 @@ public class BMUSEScraper extends ScraperFilteredCore {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());
|
private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());
|
||||||
|
|
||||||
public String scrapeUrl(String url, Boolean dynamic) throws Exception {
|
public String scrapeUrl(String url, Boolean dynamic) throws Exception {
|
||||||
logger.info(url + " > scraping");
|
logger.debug(url + " > scraping");
|
||||||
url = fixURL(url);
|
url = fixURL(url);
|
||||||
|
|
||||||
String html = "";
|
String html = "";
|
||||||
|
@ -49,24 +49,24 @@ public class BMUSEScraper extends ScraperFilteredCore {
|
||||||
|
|
||||||
html = injectId(html, url);
|
html = injectId(html, url);
|
||||||
|
|
||||||
logger.info(url + " > html scraped from " + url);
|
logger.debug(url + " > html scraped from " + url);
|
||||||
DocumentSource source = new StringDocumentSource(html, url);
|
DocumentSource source = new StringDocumentSource(html, url);
|
||||||
String n3 = html2Triples(source, url);
|
String n3 = html2Triples(source, url);
|
||||||
if (n3 == null) {
|
if (n3 == null) {
|
||||||
throw new MissingMarkupException(url);
|
throw new MissingMarkupException(url);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(url + " > processing triples");
|
logger.debug(url + " > processing triples");
|
||||||
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
|
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
|
||||||
Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l);
|
Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l);
|
||||||
if (updatedModel == null) {
|
if (updatedModel == null) {
|
||||||
throw new Exception("rdf model null");
|
throw new Exception("rdf model null");
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(url + " > generating nquads");
|
logger.debug(url + " > generating nquads");
|
||||||
try (StringWriter jsonLDWriter = new StringWriter()) {
|
try (StringWriter jsonLDWriter = new StringWriter()) {
|
||||||
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
|
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
|
||||||
logger.info(url + " > nquads generated");
|
logger.debug(url + " > nquads generated");
|
||||||
return jsonLDWriter.toString();
|
return jsonLDWriter.toString();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw e;
|
throw e;
|
||||||
|
|
Loading…
Reference in New Issue