From 220980491904f3d3ffc07e081d4eeb523599684e Mon Sep 17 00:00:00 2001 From: "konstantina.galouni" Date: Thu, 25 Nov 2021 13:51:17 +0200 Subject: [PATCH] [Explore]: services/sitemaps: In sitemaps creation, added case for querying with subjects (instead of refine). 1. extractUrlsFromSearch.ts: a. Added input read for result type and for subject (can be given as empty). b. Added creation of sitemaps for results related to a specific subject (many methods are parametrized accordingly). 2. run.sh: Added a second parameter for subject. 3. .gitignore: Updated gitignore file to ignore sitemaps created locally in commit process. --- .gitignore | 4 + services/sitemaps/extractUrlsFromSearch.ts | 147 +++++++++++++++++---- services/sitemaps/run.sh | 2 +- 3 files changed, 128 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index 3ee4889..88adc7d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ **/.idea **/dist **/uploads +services/sitemaps/*.xml +services/sitemaps/*.txt +services/sitemaps/sitemaps +services/sitemaps/sitemaps.zip \ No newline at end of file diff --git a/services/sitemaps/extractUrlsFromSearch.ts b/services/sitemaps/extractUrlsFromSearch.ts index 214bfff..6bcebf4 100644 --- a/services/sitemaps/extractUrlsFromSearch.ts +++ b/services/sitemaps/extractUrlsFromSearch.ts @@ -13,6 +13,62 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent const request = require('superagent'); +function getForSubject(resultsPerUrl, resultsType, subject) { + setTimeout(async () => { + let allUrls = new Set(); + + let reqSubject: string = subjectMapping[subject]; + if (reqSubject == null) { + reqSubject = subject; + } + + let url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType; + + let totalResults: number = 150000; + await new Promise((resolve, reject) => { + request.get(url+"&size=0&page=0", function (err: any, response: any) { + if (!response && 
err) { + reject(err); + } else { + totalResults = response.body['meta']['total']; + resolve(value => value); + } + }) + }).catch(error => { + console.error("Error getting results ", error); + fs.appendFileSync("./" + errorFileName, "no response " + url + " "); + fs.appendFileSync("./" + errorFileName, error); + fs.appendFileSync("./" + errorFileName, "\n"); + }) + + let pages: number = Math.ceil(totalResults/resultsPerUrl); + console.log("totalResults="+totalResults + " - pages="+pages); + for(let page=0; page<pages; page++) { + await new Promise(resolve => setTimeout(resolve, 500)); + + promiseArray.push( + new Promise((resolve, reject) => { + request.get(url+"&size="+resultsPerUrl+"&page="+page, function (err: any, response: any) { + if (!response && err) { + reject(err); + } else { + parseAllUrls(response, allUrls, resultsType, subject); + resolve(value => value); + } + }) + }).catch(error => { + console.error("Error getting results ", error); + fs.appendFileSync("./" + errorFileName, "no response " + url + " "); + fs.appendFileSync("./" + errorFileName, error); + fs.appendFileSync("./" + errorFileName, "\n"); + }) + ); + } + + finalize(resultsPerUrl, resultsType); + }); +} + function get(resultsPerUrl, resultsType) { setTimeout(() => { let searchFields = new SearchFields(); @@ -82,24 +138,28 @@ function get(resultsPerUrl, resultsType) { console.log(""); } - await Promise.all(promiseArray); - console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin); - console.log("\nNo indexed urls: "+noIndexedUrls + " vs final urls: "+finalUrls); - console.log("\nPublications: "+publications + " - Datasets: "+datasets + - " - Software: "+software + " - Other: "+other + " --- urls with pid: "+urlsWithPid); - - fs.appendFile("./" + fileName, "\n", function (err) { - if (err) { - return console.log("Error appending in file "+fileName+": ", err); - } - console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)"); - }); + finalize(resultsPerUrl, resultsType); } }) }) } // }); +async 
function finalize(resultsPerUrl, resultsType) { + await Promise.all(promiseArray); + console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin); + console.log("\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls); + console.log("\nPublications: " + publications + " - Datasets: " + datasets + + " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid); + + fs.appendFile("./" + fileName, "\n", function (err) { + if (err) { + return console.log("Error appending in file " + fileName + ": ", err); + } + console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)"); + }); +} + function parseAllUrls_old(response: any, allUrls: any) { // let allUrls: any = []; @@ -157,7 +217,7 @@ function parseAllUrls_old(response: any, allUrls: any) { } -function parseAllUrls(response: any, allUrls: any, resultsType: string) { +function parseAllUrls(response: any, allUrls: any, resultsType: string, subject: string = null) { // let allUrls: any = []; let responses: any = response.body['results']; @@ -210,7 +270,7 @@ function parseAllUrls(response: any, allUrls: any, resultsType: string) { console.log("url to be added in file: "+url); fs.appendFileSync("./" + fileName, "\n"); console.log("\n"); - createSitemapFile(resultsType); + createSitemapFile(resultsType, subject); } finalUrls++; @@ -324,21 +384,25 @@ function getCommunities() { }).catch(error => console.error("Error getting communities ", error)); } -function buildSiteMap(resultsPerUrl, resultsType) { +function buildSiteMap(resultsPerUrl, resultsType, subject = null) { console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)"); let date = new Date(); - createSitemapFile(resultsType); - errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime(); - noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 
1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime(); + createSitemapFile(resultsType, subject); + errorFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime(); + noIndexFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime(); - getCommunities(); // comment out for communities query check - get(resultsPerUrl, resultsType); + if(subject) { + getForSubject(resultsPerUrl, resultsType, subject); + } else { + getCommunities(); // comment out for communities query check + get(resultsPerUrl, resultsType); + } } -async function createSitemapFile(resultsType) { +async function createSitemapFile(resultsType, subject=null) { // let date = new Date(); - fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "") + fileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_sitemap" + (filesCreated > 0 ? filesCreated : "") // + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".xml";//+"_"+date.getTime(); filesCreated++; @@ -349,6 +413,39 @@ async function createSitemapFile(resultsType) { fs.writeFileSync("./" + fileName, sitemap); } +function askQuestion(query) { + const readline = require('readline'); + + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }); + + return new Promise(resolve => rl.question(query, ans => { + rl.close(); + resolve(ans); + })) +} + +async function start(resultsType, subject = null) { + if(resultsType == null) { + resultsType = await askQuestion("Please provide type of results (publications, datasets, software, other): "); + } + console.log("type is: " + resultsType); + + if(subject == null) { + subject = await askQuestion("Please provide subject. 
" + + "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " + + "any other subject you want or no value if no subject: "); + if(!subject) { + subject = null; + } + } + console.log("subject is: " + subject); + + buildSiteMap(200, resultsType, subject); +} + let filesCreated = 0; let fileName; let errorFileName; @@ -379,5 +476,7 @@ const landingPrefix = "https://explore.openaire.eu/search/"; let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type="; const contextUrl = "https://services.openaire.eu/openaire/contexts/"; -// process.argc[3] is the "resultType" argument -buildSiteMap(200, process.argv[3]); +const subjectMapping = {"physics": "Physics::Atomic Physics", "mathematics": "Mathematics::Combinatorics"} + +// process.argv[3] is the "resultType" argument, process.argv[4] is the "subject" argument +start(process.argv[3], process.argv[4]); diff --git a/services/sitemaps/run.sh b/services/sitemaps/run.sh index cd4becb..61314c5 100755 --- a/services/sitemaps/run.sh +++ b/services/sitemaps/run.sh @@ -1 +1 @@ -npx ts-node extractUrlsFromSearch.ts -- $1 +npx ts-node extractUrlsFromSearch.ts -- $1 $2