Merge branch 'master' of code-repo.d4science.org:MaDgIK/explore-services
This commit is contained in:
commit
488aaf4a2a
|
@ -4,3 +4,7 @@
|
||||||
**/.idea
|
**/.idea
|
||||||
**/dist
|
**/dist
|
||||||
**/uploads
|
**/uploads
|
||||||
|
services/sitemaps/*.xml
|
||||||
|
services/sitemaps/*.txt
|
||||||
|
services/sitemaps/sitemaps
|
||||||
|
services/sitemaps/sitemaps.zip
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,19 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/publications_mathematics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/datasets_mathematics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/software_mathematics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/other_mathematics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
</sitemapindex>
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/publications_physics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/publications_physics_sitemap1.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/publications_physics_sitemap2.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/datasets_physics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/software_physics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
<sitemap>
|
||||||
|
<loc>https://explore.openaire.eu/other_physics_sitemap.xml.gz</loc>
|
||||||
|
<lastmod>2021-11-22</lastmod>
|
||||||
|
</sitemap>
|
||||||
|
</sitemapindex>
|
|
@ -2,10 +2,10 @@
|
||||||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||||
<sitemap>
|
<sitemap>
|
||||||
<loc>https://explore.openaire.eu/publications_sitemap.xml.gz</loc>
|
<loc>https://explore.openaire.eu/publications_sitemap.xml.gz</loc>
|
||||||
<lastmod>2021-10-07</lastmod>
|
<lastmod>2021-11-22</lastmod>
|
||||||
</sitemap>
|
</sitemap>
|
||||||
<sitemap>
|
<sitemap>
|
||||||
<loc>https://explore.openaire.eu/publications_sitemap1.xml.gz</loc>
|
<loc>https://explore.openaire.eu/publications_sitemap1.xml.gz</loc>
|
||||||
<lastmod>2021-10-07</lastmod>
|
<lastmod>2021-11-22</lastmod>
|
||||||
</sitemap>
|
</sitemap>
|
||||||
</sitemapindex>
|
</sitemapindex>
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -13,6 +13,62 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
|
||||||
|
|
||||||
const request = require('superagent');
|
const request = require('superagent');
|
||||||
|
|
||||||
|
/**
 * Fetches every result of type `resultsType` that is tagged with `subject` and hands
 * the responses to `parseAllUrls`, which fills the sitemap files.
 *
 * The work is scheduled with `setTimeout` and runs asynchronously; nothing is returned.
 * A zero-size request first asks the search API how many results match. The result
 * pages are then requested one after the other, a new request being started every
 * 500 ms. Each page request is registered in `promiseArray`; `finalize` is called once
 * all of them have been started and waits for them to settle.
 *
 * Failures are printed to the console and appended to the error file. They never abort
 * the run: a failed page is simply skipped.
 *
 * @param resultsPerUrl page size used for every search request
 * @param resultsType   value sent as the `type` query parameter
 * @param subject       subject to filter on; a key of `subjectMapping` is replaced by
 *                      its mapped value, any other value is used verbatim
 */
function getForSubject(resultsPerUrl, resultsType, subject) {
  setTimeout(async () => {
    const allUrls = new Set();

    // Only own keys of the mapping count as aliases; a plain index lookup would also
    // return inherited members such as `toString` for a subject with that name.
    const mappedSubject = Object.prototype.hasOwnProperty.call(subjectMapping, subject)
      ? subjectMapping[subject]
      : null;
    const reqSubject: string = mappedSubject != null ? mappedSubject : subject;

    const url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType;

    // Logs a failed request. The error is stringified explicitly because
    // fs.appendFileSync only accepts strings/buffers; passing the raw error object
    // would make this handler throw on current Node versions.
    const logFailure = (failedUrl: string, error: unknown) => {
      console.error("Error getting results ", error);
      fs.appendFileSync("./" + errorFileName, "no response " + failedUrl + " " + String(error) + "\n");
    };

    // Used as-is when the count request fails.
    // NOTE(review): 150000 looks like a deliberate upper bound - confirm.
    let totalResults: number = 150000;
    const countUrl = url + "&size=0&page=0";
    await new Promise<void>((resolve, reject) => {
      request.get(countUrl, function (err: any, response: any) {
        if (!response && err) {
          reject(err);
          return;
        }
        // An error response may carry a body without `meta.total`; reject instead of
        // throwing a TypeError from inside the callback.
        const meta = response.body ? response.body['meta'] : null;
        const total = meta && meta['total'] != null ? Number(meta['total']) : NaN;
        if (!Number.isFinite(total)) {
          reject(err || new Error("response does not contain meta.total"));
          return;
        }
        totalResults = total;
        resolve();
      });
    }).catch(error => logFailure(countUrl, error));

    const pages: number = Math.ceil(totalResults / resultsPerUrl);
    console.log("totalResults="+totalResults + " - pages="+pages);

    for (let page = 0; page < pages; page++) {
      // Throttle: start at most one page request every 500 ms.
      await new Promise(resolve => setTimeout(resolve, 500));

      const pageUrl = url + "&size=" + resultsPerUrl + "&page=" + page;
      promiseArray.push(
        new Promise<void>((resolve, reject) => {
          request.get(pageUrl, function (err: any, response: any) {
            if (!response && err) {
              reject(err);
              return;
            }
            try {
              parseAllUrls(response, allUrls, resultsType, subject);
              resolve();
            } catch (parseError) {
              // Keep a malformed page from escaping the callback as an uncaught exception.
              reject(parseError);
            }
          });
        }).catch(error => logFailure(pageUrl, error))
      );
    }

    finalize(resultsPerUrl, resultsType);
  });
}
|
||||||
|
|
||||||
function get(resultsPerUrl, resultsType) {
|
function get(resultsPerUrl, resultsType) {
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
let searchFields = new SearchFields();
|
let searchFields = new SearchFields();
|
||||||
|
@ -82,24 +138,28 @@ function get(resultsPerUrl, resultsType) {
|
||||||
console.log("");
|
console.log("");
|
||||||
}
|
}
|
||||||
|
|
||||||
await Promise.all(promiseArray);
|
finalize(resultsPerUrl, resultsType);
|
||||||
console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);
|
|
||||||
console.log("\nNo indexed urls: "+noIndexedUrls + " vs final urls: "+finalUrls);
|
|
||||||
console.log("\nPublications: "+publications + " - Datasets: "+datasets +
|
|
||||||
" - Software: "+software + " - Other: "+other + " --- urls with pid: "+urlsWithPid);
|
|
||||||
|
|
||||||
fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
|
|
||||||
if (err) {
|
|
||||||
return console.log("Error appending in file "+fileName+": ", err);
|
|
||||||
}
|
|
||||||
console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
// });
|
// });
|
||||||
|
|
||||||
|
/**
 * Waits for every request registered in `promiseArray`, prints the run counters and
 * appends the closing `</urlset>` tag to the current sitemap file.
 *
 * The timer labelled with `resultsPerUrl` and `resultsType` is stopped only when the
 * append succeeds; an append error is printed instead.
 *
 * @param resultsPerUrl page size of the run, part of the timer label
 * @param resultsType   result type of the run, part of the timer label
 */
async function finalize(resultsPerUrl, resultsType) {
  await Promise.all(promiseArray);

  const summaryLines = [
    "\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin,
    "\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls,
    "\nPublications: " + publications + " - Datasets: " + datasets +
      " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid,
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  const timerLabel = "total_time (" + resultsPerUrl + " " + resultsType + " per request)";
  fs.appendFile("./" + fileName, "\n</urlset>", (err) => {
    if (!err) {
      console.timeEnd(timerLabel);
    } else {
      console.log("Error appending in file " + fileName + ": ", err);
    }
  });
}
|
||||||
|
|
||||||
function parseAllUrls_old(response: any, allUrls: any) {
|
function parseAllUrls_old(response: any, allUrls: any) {
|
||||||
// let allUrls: any = [];
|
// let allUrls: any = [];
|
||||||
|
|
||||||
|
@ -157,7 +217,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
function parseAllUrls(response: any, allUrls: any, resultsType: string, subject: string = null) {
|
||||||
// let allUrls: any = [];
|
// let allUrls: any = [];
|
||||||
|
|
||||||
let responses: any = response.body['results'];
|
let responses: any = response.body['results'];
|
||||||
|
@ -210,7 +270,7 @@ function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
||||||
console.log("url to be added in file: "+url);
|
console.log("url to be added in file: "+url);
|
||||||
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
||||||
console.log("\n");
|
console.log("\n");
|
||||||
createSitemapFile(resultsType);
|
createSitemapFile(resultsType, subject);
|
||||||
}
|
}
|
||||||
|
|
||||||
finalUrls++;
|
finalUrls++;
|
||||||
|
@ -324,21 +384,25 @@ function getCommunities() {
|
||||||
}).catch(error => console.error("Error getting communities ", error));
|
}).catch(error => console.error("Error getting communities ", error));
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildSiteMap(resultsPerUrl, resultsType) {
|
function buildSiteMap(resultsPerUrl, resultsType, subject = null) {
|
||||||
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||||
|
|
||||||
let date = new Date();
|
let date = new Date();
|
||||||
createSitemapFile(resultsType);
|
createSitemapFile(resultsType, subject);
|
||||||
errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
errorFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||||
noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
noIndexFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||||
|
|
||||||
getCommunities(); // comment out for communities query check
|
if(subject) {
|
||||||
get(resultsPerUrl, resultsType);
|
getForSubject(resultsPerUrl, resultsType, subject);
|
||||||
|
} else {
|
||||||
|
getCommunities(); // comment out for communities query check
|
||||||
|
get(resultsPerUrl, resultsType);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function createSitemapFile(resultsType) {
|
async function createSitemapFile(resultsType, subject=null) {
|
||||||
// let date = new Date();
|
// let date = new Date();
|
||||||
fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
fileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
||||||
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
||||||
+ ".xml";//+"_"+date.getTime();
|
+ ".xml";//+"_"+date.getTime();
|
||||||
filesCreated++;
|
filesCreated++;
|
||||||
|
@ -349,6 +413,39 @@ async function createSitemapFile(resultsType) {
|
||||||
fs.writeFileSync("./" + fileName, sitemap);
|
fs.writeFileSync("./" + fileName, sitemap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Prints `query` on the terminal and waits for an answer on standard input.
 *
 * @param query prompt text shown to the user
 * @returns a promise resolved with the answer; the readline interface is closed
 *          before the promise resolves
 */
function askQuestion(query) {
  const { createInterface } = require('readline');
  const terminal = createInterface({ input: process.stdin, output: process.stdout });

  return new Promise(resolve => {
    terminal.question(query, answer => {
      terminal.close();
      resolve(answer);
    });
  });
}
|
||||||
|
|
||||||
|
/**
 * Entry point: makes sure a result type and (optionally) a subject are known, asking
 * on the terminal for whichever was not supplied, then starts the sitemap build with
 * 200 results per request.
 *
 * @param resultsType result type to process; prompted for when null/undefined
 * @param subject     subject to restrict the results to; prompted for when
 *                    null/undefined, and an empty answer means "no subject"
 */
async function start(resultsType, subject = null) {
  const chosenType = resultsType != null
    ? resultsType
    : await askQuestion("Please provide type of results (publications, datasets, software, other): ");
  console.log("type is: " + chosenType);

  let chosenSubject = subject;
  if (chosenSubject == null) {
    const answer = await askQuestion("Please provide subject. " +
      "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " +
      "any other subject you want or no value if no subject: ");
    // An empty answer is normalised to null so the build runs without a subject.
    chosenSubject = answer ? answer : null;
  }
  console.log("subject is: " + chosenSubject);

  buildSiteMap(200, chosenType, chosenSubject);
}
|
||||||
|
|
||||||
let filesCreated = 0;
|
let filesCreated = 0;
|
||||||
let fileName;
|
let fileName;
|
||||||
let errorFileName;
|
let errorFileName;
|
||||||
|
@ -379,5 +476,7 @@ const landingPrefix = "https://explore.openaire.eu/search/";
|
||||||
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||||
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
||||||
|
|
||||||
// process.argc[3] is the "resultType" argument
|
const subjectMapping = {"physics": "Physics::Atomic Physics", "mathematics": "Mathematics::Combinatorics"}
|
||||||
buildSiteMap(200, process.argv[3]);
|
|
||||||
|
// process.argv[3] is the "resultsType" argument, process.argv[4] is the "subject" argument
|
||||||
|
start(process.argv[3], process.argv[4]);
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
npx ts-node extractUrlsFromSearch.ts -- $1
|
# Forward the arguments untouched: quoting keeps a subject that contains spaces
# (e.g. "Physics::Atomic Physics") in one piece, and arguments that were not given
# stay absent so the script can still prompt for them.
npx ts-node extractUrlsFromSearch.ts -- "$@"
|
||||||
|
|
Loading…
Reference in New Issue