Merge branch 'master' of code-repo.d4science.org:MaDgIK/explore-services
This commit is contained in:
commit
97c79b5228
|
@ -4,3 +4,7 @@
|
|||
**/.idea
|
||||
**/dist
|
||||
**/uploads
|
||||
services/sitemaps/*.xml
|
||||
services/sitemaps/*.txt
|
||||
services/sitemaps/sitemaps
|
||||
services/sitemaps/sitemaps.zip
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,19 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/publications_mathematics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/datasets_mathematics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/software_mathematics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/other_mathematics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
</sitemapindex>
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/publications_physics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/publications_physics_sitemap1.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/publications_physics_sitemap2.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/datasets_physics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/software_physics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/other_physics_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
</sitemapindex>
|
|
@ -2,10 +2,10 @@
|
|||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/publications_sitemap.xml.gz</loc>
|
||||
<lastmod>2021-10-07</lastmod>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://explore.openaire.eu/publications_sitemap1.xml.gz</loc>
|
||||
<lastmod>2021-10-07</lastmod>
|
||||
<lastmod>2021-11-22</lastmod>
|
||||
</sitemap>
|
||||
</sitemapindex>
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -13,6 +13,62 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
|
|||
|
||||
const request = require('superagent');
|
||||
|
||||
/**
 * Queues the search requests for every result of `resultsType` that carries the given subject
 * and hands each response to `parseAllUrls`.
 *
 * The work is scheduled with `setTimeout` (no delay) and runs asynchronously:
 *  1. `subject` is translated through `subjectMapping`; values without a mapping are used as given.
 *  2. A `size=0` request reads `meta.total`, from which the number of pages is computed.
 *  3. One request per page is issued, 500 ms apart, and pushed to `promiseArray`.
 *  4. `finalize` is called once all page requests have been queued.
 *
 * Failed requests are reported on the console and appended to the file named by `errorFileName`;
 * they never abort the run.
 *
 * @param resultsPerUrl number of results requested per page
 * @param resultsType   value sent as the `type` query parameter
 * @param subject       key of `subjectMapping` or a full subject string
 */
function getForSubject(resultsPerUrl, resultsType, subject) {
  // Records a failed request on the console and in the error file.
  // The error is stringified explicitly: fs.appendFileSync accepts only strings and binary
  // buffers, so passing the raw error object could throw inside the catch handler itself.
  const reportFailure = (failedUrl: string, error: any) => {
    console.error("Error getting results ", error);
    fs.appendFileSync("./" + errorFileName, "no response " + failedUrl + " " + String(error) + "\n");
  };

  setTimeout(async () => {
    let allUrls = new Set();

    let reqSubject: string = subjectMapping[subject];
    if (reqSubject == null) {
      reqSubject = subject;
    }

    let url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType;

    // Kept as the page budget when the count request below fails.
    let totalResults: number = 150000;
    await new Promise<void>((resolve, reject) => {
      request.get(url + "&size=0&page=0", function (err: any, response: any) {
        // Guard the shape of the response before reading the total: an exception thrown
        // inside this callback would leave the promise unsettled.
        const meta = response && response.body ? response.body['meta'] : null;
        if (meta == null) {
          reject(err || new Error("response without meta information"));
        } else {
          totalResults = meta['total'];
          resolve();
        }
      })
    }).catch(error => reportFailure(url, error));

    let pages: number = Math.ceil(totalResults / resultsPerUrl);
    console.log("totalResults=" + totalResults + " - pages=" + pages);

    for (let page = 0; page < pages; page++) {
      // Throttle: leave 500 ms between consecutive page requests.
      await new Promise(resolve => setTimeout(resolve, 500));

      promiseArray.push(
        new Promise<void>((resolve, reject) => {
          request.get(url + "&size=" + resultsPerUrl + "&page=" + page, function (err: any, response: any) {
            if (!response || !response.body) {
              reject(err || new Error("empty response"));
              return;
            }
            try {
              parseAllUrls(response, allUrls, resultsType, subject);
              resolve();
            } catch (parseError) {
              // Settle the promise so that a single bad page cannot stall the run.
              reject(parseError);
            }
          })
        }).catch(error => reportFailure(url, error))
      );
    }

    finalize(resultsPerUrl, resultsType);
  });
}
|
||||
|
||||
function get(resultsPerUrl, resultsType) {
|
||||
setTimeout(() => {
|
||||
let searchFields = new SearchFields();
|
||||
|
@ -82,24 +138,28 @@ function get(resultsPerUrl, resultsType) {
|
|||
console.log("");
|
||||
}
|
||||
|
||||
await Promise.all(promiseArray);
|
||||
console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);
|
||||
console.log("\nNo indexed urls: "+noIndexedUrls + " vs final urls: "+finalUrls);
|
||||
console.log("\nPublications: "+publications + " - Datasets: "+datasets +
|
||||
" - Software: "+software + " - Other: "+other + " --- urls with pid: "+urlsWithPid);
|
||||
|
||||
fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
|
||||
if (err) {
|
||||
return console.log("Error appending in file "+fileName+": ", err);
|
||||
}
|
||||
console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||
});
|
||||
finalize(resultsPerUrl, resultsType);
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
// });
|
||||
|
||||
/**
 * Waits for every request queued in `promiseArray`, prints the run statistics and
 * appends the closing `</urlset>` tag to the current sitemap file.
 *
 * @param resultsPerUrl page size of the run; only used in the timer label
 * @param resultsType   result type of the run; only used in the timer label
 */
async function finalize(resultsPerUrl, resultsType) {
  await Promise.all(promiseArray);

  const summary = [
    "\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin,
    "\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls,
    "\nPublications: " + publications + " - Datasets: " + datasets +
    " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid
  ];
  for (const line of summary) {
    console.log(line);
  }

  // Same label that was passed to console.time when the run started.
  const timerLabel = "total_time (" + resultsPerUrl + " " + resultsType + " per request)";

  fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
    if (err) {
      console.log("Error appending in file " + fileName + ": ", err);
    } else {
      console.timeEnd(timerLabel);
    }
  });
}
|
||||
|
||||
function parseAllUrls_old(response: any, allUrls: any) {
|
||||
// let allUrls: any = [];
|
||||
|
||||
|
@ -157,7 +217,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
|
|||
}
|
||||
|
||||
|
||||
function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
||||
function parseAllUrls(response: any, allUrls: any, resultsType: string, subject: string = null) {
|
||||
// let allUrls: any = [];
|
||||
|
||||
let responses: any = response.body['results'];
|
||||
|
@ -210,7 +270,7 @@ function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
|||
console.log("url to be added in file: "+url);
|
||||
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
||||
console.log("\n");
|
||||
createSitemapFile(resultsType);
|
||||
createSitemapFile(resultsType, subject);
|
||||
}
|
||||
|
||||
finalUrls++;
|
||||
|
@ -324,21 +384,25 @@ function getCommunities() {
|
|||
}).catch(error => console.error("Error getting communities ", error));
|
||||
}
|
||||
|
||||
function buildSiteMap(resultsPerUrl, resultsType) {
|
||||
function buildSiteMap(resultsPerUrl, resultsType, subject = null) {
|
||||
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||
|
||||
let date = new Date();
|
||||
createSitemapFile(resultsType);
|
||||
errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||
noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||
createSitemapFile(resultsType, subject);
|
||||
errorFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||
noIndexFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||
|
||||
getCommunities(); // comment out for communities query check
|
||||
get(resultsPerUrl, resultsType);
|
||||
if(subject) {
|
||||
getForSubject(resultsPerUrl, resultsType, subject);
|
||||
} else {
|
||||
getCommunities(); // comment out for communities query check
|
||||
get(resultsPerUrl, resultsType);
|
||||
}
|
||||
}
|
||||
|
||||
async function createSitemapFile(resultsType) {
|
||||
async function createSitemapFile(resultsType, subject=null) {
|
||||
// let date = new Date();
|
||||
fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
||||
fileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
||||
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
||||
+ ".xml";//+"_"+date.getTime();
|
||||
filesCreated++;
|
||||
|
@ -349,6 +413,39 @@ async function createSitemapFile(resultsType) {
|
|||
fs.writeFileSync("./" + fileName, sitemap);
|
||||
}
|
||||
|
||||
/**
 * Prints `query` on stdout and reads one line from stdin.
 *
 * @param query text shown to the user
 * @returns a promise resolved with the answer typed by the user
 */
function askQuestion(query) {
  const { createInterface } = require('readline');

  const prompt = createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise(resolve => {
    prompt.question(query, answer => {
      // Release stdin before handing the answer over.
      prompt.close();
      resolve(answer);
    });
  });
}
|
||||
|
||||
/**
 * Entry point of the script: asks for any argument that was not supplied,
 * then starts the sitemap build with 200 results per request.
 *
 * @param resultsType type of results; asked interactively when null or undefined
 * @param subject     subject filter; asked interactively when null or undefined,
 *                    an empty answer means "no subject"
 */
async function start(resultsType, subject = null) {
  const typeMissing = resultsType == null;
  if (typeMissing) {
    resultsType = await askQuestion("Please provide type of results (publications, datasets, software, other): ");
  }
  console.log("type is: " + resultsType);

  const subjectMissing = subject == null;
  if (subjectMissing) {
    const answer = await askQuestion("Please provide subject. " +
      "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " +
      "any other subject you want or no value if no subject: ");
    // An empty answer is normalised to null.
    subject = answer ? answer : null;
  }
  console.log("subject is: " + subject);

  buildSiteMap(200, resultsType, subject);
}
|
||||
|
||||
let filesCreated = 0;
|
||||
let fileName;
|
||||
let errorFileName;
|
||||
|
@ -379,5 +476,7 @@ const landingPrefix = "https://explore.openaire.eu/search/";
|
|||
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
||||
|
||||
// process.argc[3] is the "resultType" argument
|
||||
buildSiteMap(200, process.argv[3]);
|
||||
const subjectMapping = {"physics": "Physics::Atomic Physics", "mathematics": "Mathematics::Combinatorics"}
|
||||
|
||||
// process.argv[3] is the "resultType" argument, process.argv[4] is the "subject" argument
|
||||
start(process.argv[3], process.argv[4]);
|
||||
|
|
|
@ -1 +1 @@
|
|||
npx ts-node extractUrlsFromSearch.ts -- $1
|
||||
npx ts-node extractUrlsFromSearch.ts -- $1 $2
|
||||
|
|
Loading…
Reference in New Issue