diff --git a/services/sitemaps/extractUrlsFromSearch.ts b/services/sitemaps/extractUrlsFromSearch.ts
index aebba9ab..6c86a3c2 100644
--- a/services/sitemaps/extractUrlsFromSearch.ts
+++ b/services/sitemaps/extractUrlsFromSearch.ts
@@ -13,11 +13,13 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
const request = require('superagent');
-function get(resultsPerUrl) {
+function get(resultsPerUrl, resultsType) {
setTimeout(() => {
let searchFields = new SearchFields();
let fieldIdsMap = searchFields.RESULT_FIELDS;
+ refineUrl += resultsType;
+
request.get(refineUrl, async function (err: any, refineResponse: any) {
if (!refineResponse && err) {
console.error("Error getting refine filters ",err);
@@ -27,8 +29,6 @@ function get(resultsPerUrl) {
let allUrls = new Set();
- let promiseArray = [];
-
for (let key of keys) {
// comment out for communities query check
if(key == "community") {
@@ -59,7 +59,7 @@ function get(resultsPerUrl) {
}
}
- const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;
+ const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;
await new Promise(resolve => setTimeout(resolve, 500));
promiseArray.push(
@@ -68,7 +68,7 @@ function get(resultsPerUrl) {
if (!response && err) {
reject(err);
} else {
- parseAllUrls(response, allUrls);
+ parseAllUrls(response, allUrls, resultsType);
resolve(value => value);
}
})
@@ -92,7 +92,7 @@ function get(resultsPerUrl) {
if (err) {
return console.log("Error appending in file "+fileName+": ", err);
}
- console.timeEnd("total_time");
+ console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
});
}
})
@@ -157,7 +157,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
}
-function parseAllUrls(response: any, allUrls: any) {
+function parseAllUrls(response: any, allUrls: any, resultsType: string) {
// let allUrls: any = [];
let responses: any = response.body['results'];
@@ -206,6 +206,13 @@ function parseAllUrls(response: any, allUrls: any) {
noIndexedUrls++;
fs.appendFileSync("./"+noIndexFileName, url+"\n");
} else {
+ if(finalUrls > 0 && ((finalUrls % 50000) == 0)) {
+ console.log("url to be added in file: "+url);
+ fs.appendFileSync("./" + fileName, "\n");
+ console.log("\n");
+ createSitemapFile(resultsType);
+ }
+
finalUrls++;
let urlPre = "\n";
let urlSuf = "";
@@ -317,43 +324,47 @@ function getCommunities() {
}).catch(error => console.error("Error getting communities ", error));
}
-function buildSiteMap(resultsPerUrl) {
- console.time("total_time");
+/**
+ * Entry point: builds the sitemap for a single result type.
+ *
+ * Starts the total-time timer, creates the first sitemap file, derives the
+ * error / noIndex log file names from the result type and the current date,
+ * then calls getCommunities() and get().
+ *
+ * @param resultsPerUrl forwarded to get(), which uses it as the "size" query parameter of each search request
+ * @param resultsType forwarded to createSitemapFile() and get(); also the prefix of the log file names
+ */
+function buildSiteMap(resultsPerUrl, resultsType) {
+ // The label must be identical to the one passed to console.timeEnd() in get().
+ console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
let date = new Date();
- fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime();
- errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
- noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
- console.log("Buiding sitemap in file: "+fileName+"\n");
-
- let sitemap = "\n" +
- "";
-
- fs.writeFile("./"+fileName, sitemap, function(err) {
- if(err) {
- return console.log("Error writing in file "+fileName+": ", err);
- }
- });
+ // Sets the module-level fileName and writes the initial content of the first sitemap file.
+ createSitemapFile(resultsType);
+ // Log file names: <resultsType>_error_<year>_<month>_<day>.txt and <resultsType>_noIndex_<year>_<month>_<day>.txt
+ errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
+ noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
getCommunities(); // comment out for communities query check
- get(resultsPerUrl);
+ get(resultsPerUrl, resultsType);
}
+/**
+ * Starts a new sitemap file and makes it the current output target.
+ *
+ * Sets the module-level `fileName` to "<resultsType>_sitemap<N>.xml", where the
+ * numeric suffix is omitted for the first file and is 1, 2, ... for the files
+ * created afterwards, increments `filesCreated`, and writes the initial
+ * content to that file (replacing an existing file of the same name).
+ *
+ * Deliberately synchronous (not `async`): the callers in this file invoke it
+ * without awaiting the result, so a failing write has to throw at the call
+ * site instead of turning into an unhandled promise rejection.
+ *
+ * @param resultsType prefix of the generated file name
+ */
+function createSitemapFile(resultsType) {
+  // let date = new Date();
+  fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
+    // + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
+    + ".xml";//+"_"+date.getTime();
+  filesCreated++;
+  console.log("Buiding sitemap in file: " + fileName + "\n");
+
+  let sitemap = "\n" + "";
+  fs.writeFileSync("./" + fileName, sitemap);
+}
+
+let filesCreated = 0; // number of sitemap files created so far; createSitemapFile() uses it as the file-name suffix
let fileName;
let errorFileName;
let noIndexFileName;
const fs = require('fs');
+let promiseArray = []; // module-level list that get() pushes its pending search requests onto
let alreadyin = 0; // duplicate urls
let notin= 0;
let finalUrls = 0;
let noIndexedUrls = 0;
+let urlsWithPid = 0;
let publications = 0;
let datasets = 0;
let software = 0;
let other = 0;
-let urlsWithPid = 0;
// comment out for communities query check
let communitiesPromise;
@@ -362,10 +373,11 @@ let publicCommunities = [];
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/";
-// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
+// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
// comment out for communities query check
-const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
+let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type="; // "let" and trailing "type=" because get() appends the result type to it
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
-buildSiteMap(150);
+// process.argv[3] is the "resultsType" argument (run.sh invokes the script as
+// "extractUrlsFromSearch.ts -- <resultsType>"). Fall back to "results", the type
+// that was hard-coded before the argument was introduced, so that running the
+// script without an argument does not put "undefined" into the request URLs
+// and the output file names.
+buildSiteMap(200, process.argv[3] || "results");
diff --git a/services/sitemaps/run.sh b/services/sitemaps/run.sh
index 44b301b1..cd4becbc 100755
--- a/services/sitemaps/run.sh
+++ b/services/sitemaps/run.sh
@@ -1 +1,2 @@
-npx ts-node extractUrlsFromSearch.ts
+# Forward the command-line arguments to the script (the first one is the result type it reads as process.argv[3]); quoted so they are not word-split or glob-expanded.
+npx ts-node extractUrlsFromSearch.ts -- "$@"