added scraper for production gateways
This commit is contained in:
parent
26437ea437
commit
53d6d773aa
|
@ -0,0 +1,94 @@
|
|||
|
||||
(:~ Address of the page that local:list() requests to collect the gateway links. :)
declare variable $url := 'https://services.d4science.org/thematic-gateways';
|
||||
|
||||
|
||||
(:~
 : Lists the gateway addresses linked from the page at $url.
 :
 : Sends an HTTP GET request to $url, takes the href value of every `a`
 : element whose class attribute equals 'entry-link', and removes every
 : '/explore' match from each value.
 :
 : @return one string per matching link, in document order
 :)
declare function local:list() {
  let $response := http:send-request(<http:request method="get"/>, $url)
  for $href in $response//a[@class = 'entry-link']/@href/data()
  return replace($href, '/explore', '')
};
|
||||
|
||||
(:~
 : Extracts the site title from a parsed page.
 :
 : The title is the text of html/head/title up to (not including) the first
 : " -" separator. When the title contains no such separator the whole title
 : is returned; substring-before() alone would yield an empty string in that
 : case and the page would end up without a title.
 :
 : @param $page parsed HTML document
 : @return the title string, or "" when the page has no title element
 :)
declare %basex:inline function local:get-title($page){
  (: [1] keeps the call well-typed even if a malformed page repeats <title> :)
  let $title := string(($page/html/head/title)[1])
  return
    if (contains($title, " -")) then substring-before($title, " -")
    else $title
};
|
||||
|
||||
(:~
 : Reads the favicon address declared in the page head.
 :
 : @param $page parsed HTML document
 : @return the href value of each html/head/link element whose rel attribute
 :         equals "Shortcut Icon" (empty sequence when there is none)
 :)
declare %basex:inline function local:get-favicon-url($page){
  for $icon-link in $page/html/head/link[@rel = "Shortcut Icon"]
  return string($icon-link/@href)
};
|
||||
|
||||
(:~
 : Placeholder for the background image lookup: nothing is extracted from the
 : page and the empty sequence is always returned.
 :
 : @param $page parsed HTML document (unused)
 : @return the empty sequence
 :)
declare %basex:inline function local:get-background-image($page){
  ( (: no background image is derived from the page :) )
};
|
||||
|
||||
(:~
 : Reads the logo image address from the site-title heading.
 :
 : @param $page parsed HTML document
 : @return the src value of every img found below an h1 whose class attribute
 :         contains "site-title"
 :)
declare %basex:inline function local:get-logo-url($page){
  let $title-headings := $page/html/body//h1[contains(@class, "site-title")]
  return ($title-headings//img) ! string(@src)
};
|
||||
|
||||
(:~
 : Reads the alternative text of the logo image in the site-title heading.
 :
 : @param $page parsed HTML document
 : @return the alt value of every img found below an h1 whose class attribute
 :         contains "site-title"
 :)
declare %basex:inline function local:get-logo-alt($page){
  let $title-headings := $page/html/body//h1[contains(@class, "site-title")]
  return ($title-headings//img) ! string(@alt)
};
|
||||
|
||||
(:~
 : Tells whether the page shows an image inside a "poweredBy-link" block.
 :
 : @param $page parsed HTML document
 : @return "yes" when an img exists below a div whose class attribute contains
 :         "poweredBy-link", otherwise "no"
 :)
declare %basex:inline function local:get-infrastructure-logo($page){
  let $powered-by-images := $page/html/body//div[contains(@class, "poweredBy-link")]//img
  return
    if (empty($powered-by-images)) then "no"
    else "yes"
};
|
||||
|
||||
(:~
 : Finds the footer link that points to "/terms-of-use".
 :
 : @param $page parsed HTML document
 : @return the href attribute node(s) of footer links whose href equals
 :         "/terms-of-use"
 :)
declare %basex:inline function local:get-terms-url($page){
  let $footer-links := $page/html/body//footer//a
  return $footer-links[@href = "/terms-of-use"]/@href
};
|
||||
|
||||
(:~
 : Finds the footer link that points to "/cookie-policy".
 :
 : @param $page parsed HTML document
 : @return the href attribute node(s) of footer links whose href equals
 :         "/cookie-policy"
 :)
declare %basex:inline function local:get-cookiepolicy-url($page){
  let $footer-links := $page/html/body//footer//a
  return $footer-links[@href = "/cookie-policy"]/@href
};
|
||||
|
||||
(:~
 : Finds the footer link labelled "Privacy Policy".
 :
 : @param $page parsed HTML document
 : @return the href attribute node(s) of footer links that have a text node
 :         equal to "Privacy Policy"
 :)
declare %basex:inline function local:get-privacypolicy-url($page){
  let $footer-links := $page/html/body//footer//a
  return $footer-links[text() = "Privacy Policy"]/@href
};
|
||||
|
||||
(:~
 : Reads the project link from the custom footer container.
 :
 : @param $page parsed HTML document
 : @return the href attribute of the last `a` child of each footer div whose
 :         class attribute contains "custom-footer-container"
 :)
declare %basex:inline function local:get-project-url($page){
  let $containers := $page/html/body//footer/div[contains(@class, "custom-footer-container")]
  return $containers/a[last()]/@href
};
|
||||
|
||||
(:~
 : Placeholder for the project description lookup: nothing is extracted from
 : the page and the empty sequence is always returned.
 :
 : @param $page parsed HTML document (unused)
 : @return the empty sequence
 :)
declare %basex:inline function local:get-project-description($page){
  ( (: no project description is derived from the page :) )
};
|
||||
|
||||
(:~
 : Tells whether the footer links to the Horizon 2020 programme page outside
 : the custom footer container.
 :
 : @param $page parsed HTML document
 : @return "yes" when a footer div whose class does not contain
 :         "custom-footer-container" has an `a` child with href
 :         "http://ec.europa.eu/programmes/horizon2020/", otherwise "no"
 :)
declare %basex:inline function local:get-ec-logo($page){
  let $standard-divs := $page/html/body//footer/div[not(contains(@class, "custom-footer-container"))]
  let $ec-links := $standard-divs/a[@href = "http://ec.europa.eu/programmes/horizon2020/"]
  return
    if (empty($ec-links)) then "no"
    else "yes"
};
|
||||
|
||||
(:~
 : Flattens the footer text into a single string.
 :
 : Every text node below html/body/footer/div/div has each single quote
 : doubled, and the resulting pieces are joined with "<br/>".
 :
 : @param $page parsed HTML document
 : @return the joined footer text ("" when no text node matches)
 :)
declare %basex:inline function local:get-footer($page){
  let $pieces :=
    for $text in $page/html/body/footer/div/div//text()
    return replace($text, "'", "''")
  return string-join($pieces, "<br/>")
};
|
||||
|
||||
(:~
 : Renders a parameter map as a keycloak-action.sh command line.
 :
 : The result starts with "./keycloak-action.sh inject-theme" and is followed
 : by one  -e 'key="value"'  argument per map entry, separated by spaces.
 : The double quotes are written as &quot; entity references: inside a
 : double-quoted XQuery literal a doubled quote is an escape for one quote
 : character, so spelling them as "" would swallow the `|| $params(.) ||`
 : part into the literal and the value would never be inserted.
 :
 : @param $params map from parameter name to value; each value must be a
 :        single item or empty, because it is an operand of the || operator
 :        (an empty value renders as key="")
 : @return the command line as one string; the argument order follows
 :         map:keys()
 :)
declare %basex:inline function local:to-ansible-call($params as map(*)){
  string-join(("./keycloak-action.sh inject-theme",
    map:keys($params) ! ("-e '" || . || "=&quot;" || $params(.) || "&quot;'")
  ), " ")
};
|
||||
|
||||
(:~
 : Builds the keycloak-action.sh command line for one gateway.
 :
 : Fetches the page at $page-addr as text, parses it with html:parse(), reads
 : the individual theme values through the local:get-* functions and passes
 : them as a map to local:to-ansible-call().
 :
 : @param $page-addr address of the gateway page; the part after "://" is
 :        used as the theme name, and the address is prepended to the logo,
 :        terms and cookie-policy values
 : @return the value returned by local:to-ansible-call()
 :)
declare %basex:inline function local:transform($page-addr){
  let $doc := html:parse(fetch:text($page-addr))
  let $theme := substring-after($page-addr, "://")
  (: values prefixed with the gateway address :)
  let $logo-url := $page-addr || local:get-logo-url($doc)
  let $terms-url := $page-addr || local:get-terms-url($doc)
  let $cookie-url := $page-addr || local:get-cookiepolicy-url($doc)
  (: values used as found on the page, only atomized to a string :)
  let $privacy-url := "" || local:get-privacypolicy-url($doc)
  let $project-url := "" || local:get-project-url($doc)
  let $settings := map {
    "theme" : $theme,
    "title_tag" : local:get-title($doc),
    "favicon_url" : local:get-favicon-url($doc),
    "logo_url" : $logo-url,
    "logo_alt" : local:get-logo-alt($doc),
    "infrastructure_logo" : local:get-infrastructure-logo($doc),
    "background_image" : local:get-background-image($doc),
    "terms_url" : $terms-url,
    "cookie_policy_url" : $cookie-url,
    "privacy_policy_url" : $privacy-url,
    "project_url" : $project-url,
    "project_description" : local:get-project-description($doc),
    "EC_logo" : local:get-ec-logo($doc),
    "footer" : local:get-footer($doc)
  }
  return local:to-ansible-call($settings)
};
|
||||
|
||||
(: Main query: one command line per gateway returned by local:list().
   xquery:fork-join() evaluates the functions it receives in parallel and
   returns their results in the order of the functions. It therefore has to
   be called once with one function per gateway; calling it inside the loop
   with a single function each time processes the gateways one after another. :)
xquery:fork-join(
  for $page-addr in local:list()
  return function() { local:transform($page-addr) }
)
|
Loading…
Reference in New Issue