[Explore Service | Services]: 1. extractUrlsFromSearch.ts: a. Query for the parametrized "resultType" instead of always querying for "results".
b. When the number of final URLs exceeds 50000, start a new sitemap file every 50000 URLs. 2. run.sh: Added $1 to pass the "resultType" argument.
This commit is contained in:
parent
f50db71c44
commit
6350a9f550
|
@ -13,11 +13,13 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
|
||||||
|
|
||||||
const request = require('superagent');
|
const request = require('superagent');
|
||||||
|
|
||||||
function get(resultsPerUrl) {
|
function get(resultsPerUrl, resultsType) {
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
let searchFields = new SearchFields();
|
let searchFields = new SearchFields();
|
||||||
let fieldIdsMap = searchFields.RESULT_FIELDS;
|
let fieldIdsMap = searchFields.RESULT_FIELDS;
|
||||||
|
|
||||||
|
refineUrl += resultsType;
|
||||||
|
|
||||||
request.get(refineUrl, async function (err: any, refineResponse: any) {
|
request.get(refineUrl, async function (err: any, refineResponse: any) {
|
||||||
if (!refineResponse && err) {
|
if (!refineResponse && err) {
|
||||||
console.error("Error getting refine filters ",err);
|
console.error("Error getting refine filters ",err);
|
||||||
|
@ -27,8 +29,6 @@ function get(resultsPerUrl) {
|
||||||
|
|
||||||
let allUrls = new Set();
|
let allUrls = new Set();
|
||||||
|
|
||||||
let promiseArray = [];
|
|
||||||
|
|
||||||
for (let key of keys) {
|
for (let key of keys) {
|
||||||
// comment out for communities query check
|
// comment out for communities query check
|
||||||
if(key == "community") {
|
if(key == "community") {
|
||||||
|
@ -59,7 +59,7 @@ function get(resultsPerUrl) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;
|
const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;
|
||||||
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 500));
|
await new Promise(resolve => setTimeout(resolve, 500));
|
||||||
promiseArray.push(
|
promiseArray.push(
|
||||||
|
@ -68,7 +68,7 @@ function get(resultsPerUrl) {
|
||||||
if (!response && err) {
|
if (!response && err) {
|
||||||
reject(err);
|
reject(err);
|
||||||
} else {
|
} else {
|
||||||
parseAllUrls(response, allUrls);
|
parseAllUrls(response, allUrls, resultsType);
|
||||||
resolve(value => value);
|
resolve(value => value);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
@ -92,7 +92,7 @@ function get(resultsPerUrl) {
|
||||||
if (err) {
|
if (err) {
|
||||||
return console.log("Error appending in file "+fileName+": ", err);
|
return console.log("Error appending in file "+fileName+": ", err);
|
||||||
}
|
}
|
||||||
console.timeEnd("total_time");
|
console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
@ -157,7 +157,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function parseAllUrls(response: any, allUrls: any) {
|
function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
||||||
// let allUrls: any = [];
|
// let allUrls: any = [];
|
||||||
|
|
||||||
let responses: any = response.body['results'];
|
let responses: any = response.body['results'];
|
||||||
|
@ -206,6 +206,13 @@ function parseAllUrls(response: any, allUrls: any) {
|
||||||
noIndexedUrls++;
|
noIndexedUrls++;
|
||||||
fs.appendFileSync("./"+noIndexFileName, url+"\n");
|
fs.appendFileSync("./"+noIndexFileName, url+"\n");
|
||||||
} else {
|
} else {
|
||||||
|
if(finalUrls > 0 && ((finalUrls % 50000) == 0)) {
|
||||||
|
console.log("url to be added in file: "+url);
|
||||||
|
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
||||||
|
console.log("\n");
|
||||||
|
createSitemapFile(resultsType);
|
||||||
|
}
|
||||||
|
|
||||||
finalUrls++;
|
finalUrls++;
|
||||||
let urlPre = "\n<url><loc>";
|
let urlPre = "\n<url><loc>";
|
||||||
let urlSuf = "</loc></url>";
|
let urlSuf = "</loc></url>";
|
||||||
|
@ -317,43 +324,47 @@ function getCommunities() {
|
||||||
}).catch(error => console.error("Error getting communities ", error));
|
}).catch(error => console.error("Error getting communities ", error));
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildSiteMap(resultsPerUrl) {
|
function buildSiteMap(resultsPerUrl, resultsType) {
|
||||||
console.time("total_time");
|
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||||
|
|
||||||
let date = new Date();
|
let date = new Date();
|
||||||
fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime();
|
createSitemapFile(resultsType);
|
||||||
errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||||
noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||||
console.log("Buiding sitemap in file: "+fileName+"\n");
|
|
||||||
|
|
||||||
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
|
|
||||||
"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
|
|
||||||
|
|
||||||
fs.writeFile("./"+fileName, sitemap, function(err) {
|
|
||||||
if(err) {
|
|
||||||
return console.log("Error writing in file "+fileName+": ", err);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
getCommunities(); // comment out for communities query check
|
getCommunities(); // comment out for communities query check
|
||||||
get(resultsPerUrl);
|
get(resultsPerUrl, resultsType);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function createSitemapFile(resultsType) {
|
||||||
|
// let date = new Date();
|
||||||
|
fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
||||||
|
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
||||||
|
+ ".xml";//+"_"+date.getTime();
|
||||||
|
filesCreated++;
|
||||||
|
|
||||||
|
console.log("Buiding sitemap in file: " + fileName + "\n");
|
||||||
|
|
||||||
|
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
|
||||||
|
fs.writeFileSync("./" + fileName, sitemap);
|
||||||
|
}
|
||||||
|
|
||||||
|
let filesCreated = 0;
|
||||||
let fileName;
|
let fileName;
|
||||||
let errorFileName;
|
let errorFileName;
|
||||||
let noIndexFileName;
|
let noIndexFileName;
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
|
|
||||||
|
let promiseArray = [];
|
||||||
let alreadyin = 0; // duplicate urls
|
let alreadyin = 0; // duplicate urls
|
||||||
let notin= 0;
|
let notin= 0;
|
||||||
let finalUrls = 0;
|
let finalUrls = 0;
|
||||||
let noIndexedUrls = 0;
|
let noIndexedUrls = 0;
|
||||||
|
let urlsWithPid = 0;
|
||||||
let publications = 0;
|
let publications = 0;
|
||||||
let datasets = 0;
|
let datasets = 0;
|
||||||
let software = 0;
|
let software = 0;
|
||||||
let other = 0;
|
let other = 0;
|
||||||
let urlsWithPid = 0;
|
|
||||||
|
|
||||||
// comment out for communities query check
|
// comment out for communities query check
|
||||||
let communitiesPromise;
|
let communitiesPromise;
|
||||||
|
@ -362,10 +373,11 @@ let publicCommunities = [];
|
||||||
|
|
||||||
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
|
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
|
||||||
const landingPrefix = "https://explore.openaire.eu/search/";
|
const landingPrefix = "https://explore.openaire.eu/search/";
|
||||||
// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||||
|
|
||||||
// comment out for communities query check
|
// comment out for communities query check
|
||||||
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||||
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
||||||
|
|
||||||
buildSiteMap(150);
|
// process.argv[3] is the "resultType" argument
|
||||||
|
buildSiteMap(200, process.argv[3]);
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
npx ts-node extractUrlsFromSearch.ts
|
npx ts-node extractUrlsFromSearch.ts -- $1
|
||||||
|
|
Loading…
Reference in New Issue