From 53d6d773aab696fab042c561be32e9007525e543 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 12 Jan 2021 10:26:39 +0100
Subject: [PATCH] added scraper for production gateways

---
 src/utils/xquery/scraper.xq | 131 +++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 src/utils/xquery/scraper.xq

diff --git a/src/utils/xquery/scraper.xq b/src/utils/xquery/scraper.xq
new file mode 100644
index 0000000..518ea79
--- /dev/null
+++ b/src/utils/xquery/scraper.xq
@@ -0,0 +1,131 @@
+(:~
+ : Scrapes the D4Science gateways listed at $url and returns, for each one,
+ : the "keycloak-action.sh inject-theme" command line carrying its theme data.
+ :)
+
+(:~ Page listing the gateways to scrape. :)
+declare variable $url := 'https://services.d4science.org/thematic-gateways';
+
+(:~
+ : Fetches $url and collects the gateway addresses linked from it.
+ : @return href of every a element with class "entry-link", "/explore" removed
+ :)
+declare function local:list() {
+  (: http:send-request expects the request description as its first argument. :)
+  let $req := <http:request method="get"/>
+  return http:send-request($req, $url)//a[@class='entry-link']/@href/data() ! replace(., '/explore', '')
+};
+
+(:~ Text of the page title up to the first " -" (empty string if absent). :)
+declare %basex:inline function local:get-title($page){
+  substring-before($page/html/head/title, " -")
+};
+
+(:~ href of the head link whose rel is "Shortcut Icon". :)
+declare %basex:inline function local:get-favicon-url($page){
+  $page/html/head/link[@rel="Shortcut Icon"]/string(@href)
+};
+
+(:~ Background image is not scraped; always returns the empty sequence. :)
+declare %basex:inline function local:get-background-image($page){
+  ()
+};
+
+(:~ src of the image inside the h1 whose class contains "site-title". :)
+declare %basex:inline function local:get-logo-url($page){
+  $page/html/body//h1[contains(@class, "site-title")]//img/string(@src)
+};
+
+(:~ alt text of the image inside the h1 whose class contains "site-title". :)
+declare %basex:inline function local:get-logo-alt($page){
+  $page/html/body//h1[contains(@class, "site-title")]//img/string(@alt)
+};
+
+(:~ "yes" when a div whose class contains "poweredBy-link" holds an image, else "no". :)
+declare %basex:inline function local:get-infrastructure-logo($page){
+  if(exists($page/html/body//div[contains(@class,"poweredBy-link")]//img)) then "yes" else "no"
+};
+
+(:~ href of the footer link that points to "/terms-of-use". :)
+declare %basex:inline function local:get-terms-url($page){
+  $page/html/body//footer//a[@href = "/terms-of-use"]/@href
+};
+
+(:~ href of the footer link that points to "/cookie-policy". :)
+declare %basex:inline function local:get-cookiepolicy-url($page){
+  $page/html/body//footer//a[@href = "/cookie-policy"]/@href
+};
+
+(:~ href of the footer link whose text is "Privacy Policy". :)
+declare %basex:inline function local:get-privacypolicy-url($page){
+  $page/html/body//footer//a[text() = "Privacy Policy"]/@href
+};
+
+(:~ href of the last link directly inside the "custom-footer-container" footer div. :)
+declare %basex:inline function local:get-project-url($page){
+  $page/html/body//footer/div[contains(@class, "custom-footer-container")]/a[last()]/@href
+};
+
+(:~ Project description is not scraped; always returns the empty sequence. :)
+declare %basex:inline function local:get-project-description($page){
+  ()
+};
+
+(:~ "yes" when a footer div other than the custom container links to the Horizon 2020 URL, else "no". :)
+declare %basex:inline function local:get-ec-logo($page){
+  if(exists(
+    $page/html/body//footer/div[not(contains(@class, "custom-footer-container"))]/a[@href = "http://ec.europa.eu/programmes/horizon2020/"]))
+  then "yes" else "no"
+};
+
+(:~ Text nodes below footer/div/div joined by newlines, with each apostrophe doubled. :)
+declare %basex:inline function local:get-footer($page){
+  string-join($page/html/body/footer/div/div//text() ! replace(., "'", "''"), "&#10;")
+};
+
+(:~
+ : Builds the inject-theme command line from a map of parameters.
+ : @param $params parameter names mapped to their values
+ : @return the script call followed by one -e 'name="value"' option per entry
+ :)
+declare %basex:inline function local:to-ansible-call($params as map(*)){
+  (: Kept in a variable so the quote character cannot be mistaken for a string delimiter. :)
+  let $quote := '"'
+  return string-join(("./keycloak-action.sh inject-theme",
+    map:keys($params) ! ("-e '" || . || "=" || $quote || $params(.) || $quote || "'")
+  ), " ")
+};
+
+(:~
+ : Downloads one gateway page and turns its theme data into a command line.
+ : @param $page-addr address of the gateway, as returned by local:list()
+ : @return the command line produced by local:to-ansible-call()
+ :)
+declare %basex:inline function local:transform($page-addr){
+  let $page := html:parse(fetch:text($page-addr))
+  return
+    local:to-ansible-call(
+      map{
+        "theme" : substring-after($page-addr, "://"),
+        "title_tag" : local:get-title($page),
+        "favicon_url" : local:get-favicon-url($page),
+        "logo_url" : $page-addr || local:get-logo-url($page),
+        "logo_alt" : local:get-logo-alt($page),
+        "infrastructure_logo" : local:get-infrastructure-logo($page),
+        "background_image" : local:get-background-image($page),
+        "terms_url" : $page-addr || local:get-terms-url($page),
+        "cookie_policy_url" : $page-addr || local:get-cookiepolicy-url($page),
+        "privacy_policy_url" : "" || local:get-privacypolicy-url($page),
+        "project_url" : "" || local:get-project-url($page),
+        "project_description" : local:get-project-description($page),
+        "EC_logo" : local:get-ec-logo($page),
+        "footer" : local:get-footer($page)
+      }
+    )
+};
+
+(: One fork-join call over all jobs, so that the pages are processed in parallel. :)
+xquery:fork-join(
+  for $page-addr in local:list()
+  return function(){ local:transform($page-addr) }
+)