diff --git a/services/sitemaps/extractUrlsFromSearch.ts b/services/sitemaps/extractUrlsFromSearch.ts index aebba9ab..6c86a3c2 100644 --- a/services/sitemaps/extractUrlsFromSearch.ts +++ b/services/sitemaps/extractUrlsFromSearch.ts @@ -13,11 +13,13 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent const request = require('superagent'); -function get(resultsPerUrl) { +function get(resultsPerUrl, resultsType) { setTimeout(() => { let searchFields = new SearchFields(); let fieldIdsMap = searchFields.RESULT_FIELDS; + refineUrl += resultsType; + request.get(refineUrl, async function (err: any, refineResponse: any) { if (!refineResponse && err) { console.error("Error getting refine filters ",err); @@ -27,8 +29,6 @@ function get(resultsPerUrl) { let allUrls = new Set(); - let promiseArray = []; - for (let key of keys) { // comment out for communities query check if(key == "community") { @@ -59,7 +59,7 @@ function get(resultsPerUrl) { } } - const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl; + const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl; await new Promise(resolve => setTimeout(resolve, 500)); promiseArray.push( @@ -68,7 +68,7 @@ function get(resultsPerUrl) { if (!response && err) { reject(err); } else { - parseAllUrls(response, allUrls); + parseAllUrls(response, allUrls, resultsType); resolve(value => value); } }) @@ -92,7 +92,7 @@ function get(resultsPerUrl) { if (err) { return console.log("Error appending in file "+fileName+": ", err); } - console.timeEnd("total_time"); + console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)"); }); } }) @@ -157,7 +157,7 @@ function parseAllUrls_old(response: any, allUrls: any) { } -function 
parseAllUrls(response: any, allUrls: any) { +function parseAllUrls(response: any, allUrls: any, resultsType: string) { // let allUrls: any = []; let responses: any = response.body['results']; @@ -206,6 +206,13 @@ function parseAllUrls(response: any, allUrls: any) { noIndexedUrls++; fs.appendFileSync("./"+noIndexFileName, url+"\n"); } else { + if(finalUrls > 0 && ((finalUrls % 50000) == 0)) { + console.log("url to be added in file: "+url); + fs.appendFileSync("./" + fileName, "\n"); + console.log("\n"); + createSitemapFile(resultsType); + } + finalUrls++; let urlPre = "\n"; let urlSuf = ""; @@ -317,43 +324,47 @@ function getCommunities() { }).catch(error => console.error("Error getting communities ", error)); } -function buildSiteMap(resultsPerUrl) { - console.time("total_time"); +function buildSiteMap(resultsPerUrl, resultsType) { + console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)"); let date = new Date(); - fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime(); - errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime(); - noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime(); - console.log("Buiding sitemap in file: "+fileName+"\n"); - - let sitemap = "\n" + - ""; - - fs.writeFile("./"+fileName, sitemap, function(err) { - if(err) { - return console.log("Error writing in file "+fileName+": ", err); - } - }); + createSitemapFile(resultsType); + errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime(); + noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime(); getCommunities(); // comment out for communities query check - get(resultsPerUrl); + get(resultsPerUrl, resultsType); } 
+async function createSitemapFile(resultsType) { + // let date = new Date(); + fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "") + // + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + + ".xml";//+"_"+date.getTime(); + filesCreated++; + console.log("Buiding sitemap in file: " + fileName + "\n"); + + let sitemap = "\n" + ""; + fs.writeFileSync("./" + fileName, sitemap); +} + +let filesCreated = 0; let fileName; let errorFileName; let noIndexFileName; const fs = require('fs'); +let promiseArray = []; let alreadyin = 0; // duplicate urls let notin= 0; let finalUrls = 0; let noIndexedUrls = 0; +let urlsWithPid = 0; let publications = 0; let datasets = 0; let software = 0; let other = 0; -let urlsWithPid = 0; // comment out for communities query check let communitiesPromise; @@ -362,10 +373,11 @@ let publicCommunities = []; const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json"; const landingPrefix = "https://explore.openaire.eu/search/"; -// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; +// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type="; // comment out for communities query check -const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; +let refineUrl = 
"https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type="; const contextUrl = "https://services.openaire.eu/openaire/contexts/"; -buildSiteMap(150); +// process.argv[3] is the "resultType" argument +buildSiteMap(200, process.argv[3]); diff --git a/services/sitemaps/run.sh b/services/sitemaps/run.sh index 44b301b1..cd4becbc 100755 --- a/services/sitemaps/run.sh +++ b/services/sitemaps/run.sh @@ -1 +1 @@ -npx ts-node extractUrlsFromSearch.ts +npx ts-node extractUrlsFromSearch.ts -- $1