diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58da14fa7fb1a2de6a35014ed5c70f31d9bee74e..ede5fc4f9c5ef103fe3665abb58b1f8069f02bdb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,12 +1,14 @@
 cmake_minimum_required(VERSION 3.20)
-project(scipaper LANGUAGES C)
+project(scipaper LANGUAGES C CXX)
 
 set(SCI_SYSCONF_DIR /usr/share/scipaper)
 set(SCI_MODULE_DIR /usr/lib/scipaper/modules)
+set(SCI_USERCONF_DIR .config/scipaper)
 
 add_definitions(-D_GNU_SOURCE)
 add_definitions(-DSCI_MODULE_DIR=${SCI_MODULE_DIR})
 add_definitions(-DSCI_SYSCONF_DIR=${SCI_SYSCONF_DIR})
+add_definitions(-DSCI_USERCONF_DIR=${SCI_USERCONF_DIR})
 add_definitions(-DSCI_SYSCONF_INI=scipaper.ini)
 
 find_package(Doxygen)
@@ -17,6 +19,10 @@ pkg_search_module(GLIB REQUIRED glib-2.0)
 pkg_search_module(GMODULE REQUIRED gmodule-2.0)
 pkg_check_modules(CURL REQUIRED libcurl)
 
+# Only the CMAKE_ prefixed variables select the language standard; main.cpp needs std::filesystem and designated initializers.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_C_STANDARD 99)
+
 set(COMMON_INCLUDE_DIRS
 	${GLIB_INCLUDE_DIRS}
 	${GMODULE_INCLUDE_DIRS}
@@ -26,7 +32,7 @@ set(COMMON_INCLUDE_DIRS
 	${CMAKE_CURRENT_LIST_DIR}/src/modapi)
 message(${COMMON_INCLUDE_DIRS})
 set(COMMON_LIBRARIES ${GLIB_LIBRARIES} ${GMODULE_LIBRARIES} ${CURL_LIBRARIES})
-set(COMMON_FLAGS "-std=c99 -Wall -O2 -march=native -g -fno-strict-aliasing")
+set(COMMON_FLAGS "-Wall -O2 -march=native -g -fno-strict-aliasing")
 
 set(CMAKE_INSTALL_PREFIX "/usr")
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 142a189dcbffa7784b8bdd9620894e3a73400734..c693d532578d5dde485579c02aa4771aaea299a5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,7 +16,7 @@ set_target_properties(${PROJECT_NAME} PROPERTIES COMPILE_FLAGS ${COMMON_FLAGS})
 install(TARGETS ${PROJECT_NAME} DESTINATION lib)
 
 link_directories(${CMAKE_CURRENT_BINARY_DIR})
-set(SRC_FILES_TEST_APP main.c)
+set(SRC_FILES_TEST_APP main.cpp log.cpp)
 set(LIBS_CLI -L.
-l${PROJECT_NAME})
 add_executable(${PROJECT_NAME}_cli ${SRC_FILES_TEST_APP})
 add_dependencies(${PROJECT_NAME}_cli ${PROJECT_NAME})
diff --git a/src/log.cpp b/src/log.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05a31102b071f3aaa7632e9a5a15fb05256c0cb7
--- /dev/null
+++ b/src/log.cpp
@@ -0,0 +1,85 @@
+/**
+* libscipaper
+* Copyright (C) 2022-2023 Carl Klemm
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* version 3 as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the
+* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+* Boston, MA 02110-1301, USA.
+*/
+
+#include "log.h"
+
+/**
+ * @brief Starts a log message of the given severity.
+ *
+ * If Log::headers is set, a "[LABEL] " prefix is written first.
+ *
+ * @param type severity of the message; it is only printed if type >= Log::level
+ * @param endlineI if true the destructor ends the message with a newline
+ */
+Log::Log(Level type, bool endlineI): endline(endlineI)
+{
+	msglevel = type;
+	if(headers)
+	{
+		operator << ("["+getLabel(type)+"] ");
+	}
+}
+
+/**
+ * @brief Ends the message with a newline if something was printed and endline is set.
+ */
+Log::~Log()
+{
+	if(opened && endline)
+	{
+		std::cout<<'\n';
+	}
+	opened = false;
+}
+
+/**
+ * @brief Returns the five character label used in message headers.
+ *
+ * @param level severity to name; SUPERDEBUG shares the "DEBUG" label
+ * @return the label, or an empty string for a value that is not an enumerator
+ */
+std::string Log::getLabel(Level level)
+{
+	std::string label;
+	switch(level)
+	{
+	case SUPERDEBUG:
+	case DEBUG:
+		label = "DEBUG";
+		break;
+	case EXTRA:
+		label = "EXTRA";
+		// must not fall through, otherwise EXTRA messages get the "INFO " label
+		break;
+	case INFO:
+		label = "INFO ";
+		break;
+	case WARN:
+		label = "WARN ";
+		break;
+	case ERROR:
+		label = "ERROR";
+		break;
+	}
+	return label;
+}
+
+// By default no headers are printed and only warnings and errors are shown.
+bool Log::headers = false;
+Log::Level Log::level = WARN;
diff --git a/src/log.h b/src/log.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9fd7f51c54fc7352b1d8a6ac20cbc25a44aa717
--- /dev/null
+++ b/src/log.h
@@ -0,0 +1,71 @@
+/**
+* autobiblatex
+* Copyright (C) 2022 Carl Klemm
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* version 3 as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the
+* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+* Boston, MA 02110-1301, USA.
+*/
+
+#pragma once
+#include <iostream>
+#include <string>
+
+/**
+ * @brief Minimal stream-style logger that writes to std::cout.
+ *
+ * One temporary Log object represents one message: Log(Log::INFO)<<"text"<<value;
+ * Output is suppressed when the severity of the message is below Log::level.
+ */
+class Log
+{
+public:
+
+	/// Severities in ascending order; the numeric order is what the filter compares.
+	enum Level
+	{
+		SUPERDEBUG,
+		DEBUG,
+		EXTRA,
+		INFO,
+		WARN,
+		ERROR
+	};
+
+private:
+	bool opened = false; // set once something was actually printed
+	Level msglevel = DEBUG; // severity of this message
+	bool endline = true; // print '\n' on destruction
+
+	std::string getLabel(Level level);
+
+public:
+
+	static bool headers; // prefix messages with "[LABEL] " when true
+	static Level level; // minimum severity that is printed
+
+	Log() {}
+	Log(Level type, bool endlineI = true);
+	~Log();
+
+	/// Prints msg if the severity of this message is at least Log::level.
+	template<class T> Log &operator<<(const T &msg)
+	{
+		if(msglevel >= level)
+		{
+			std::cout<<msg;
+			opened = true;
+		}
+		return *this;
+	}
+};
diff --git a/src/main.c b/src/main.c
deleted file mode 100644
index 955f553777a656a1e6b8b037a6f2982db20a6cbd..0000000000000000000000000000000000000000
--- a/src/main.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * main.c
- * Copyright (C) Carl Philipp Klemm 2021 <carl@uvos.xyz>
- *
- * main.c is free software: you can redistribute it and/or modify it
- * under the terms of the lesser GNU General Public License as published by the
- * Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * main.c is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License for more details.
- *
- * You should have received a copy of the lesser GNU General Public License along
- * with this program. If not, see <http://www.gnu.org/licenses/>.
- */ - -#include <stdio.h> -#include <stdbool.h> -#include <glib.h> -#include "scipaper.h" - -static void print_documents(const RequestReturn* documents) -{ - printf("Found %zu documents:\n", documents->count); - for(size_t i = 0; i < documents->count; ++i) - { - if(!documents->documents[i]) - continue; - char* documentString = document_meta_get_string(documents->documents[i]); - printf("Document found by %s:\n%s", sci_get_backend_info(documents->documents[i]->backendId)->name, documentString); - free(documentString); - } -} - -static void search_and_grab_wallauer_via_core() -{ - printf("Starting %s\n", __func__); - int id = sci_backend_get_id_by_name("core"); - if(id == 0) - { - puts("core backend not available"); - return; - } - - DocumentMeta* queryMeta = document_meta_new(); - queryMeta->author = g_strdup("Wallauer"); - queryMeta->hasFullText = true; - queryMeta->backendId = id; - RequestReturn* documents = sci_fill_meta(queryMeta, NULL, 20, 0); - document_meta_free(queryMeta); - - if(documents) - { - print_documents(documents); - - printf("Getting text for first document from: %s (%i)\n", - sci_get_backend_name(documents->documents[0]->backendId), documents->documents[0]->backendId); - char* text = sci_get_document_text(documents->documents[0]); - if(text) - puts("got text!"); - free(text); - - for(size_t i = 0; i < documents->count; ++i) - { - PdfData* data = sci_get_document_pdf_data(documents->documents[i]); - if(data) - { - puts("got got data! 
saveing.."); - char* fileName = g_strdup_printf("./%zu.pdf", i); - bool ret = sci_save_pdf_to_file(data, fileName); - g_free(fileName); - if(ret) - puts("saved"); - else - puts("not saved"); - } - } - request_return_free(documents); - } - else - { - puts("Could not find any documents that matched query"); - } - -} - -static void search_wallauer(void) -{ - printf("Starting %s\n", __func__); - DocumentMeta* queryMeta = document_meta_new(); - queryMeta->author = g_strdup("Wallauer"); - - RequestReturn* documents = sci_fill_meta(queryMeta, NULL, 20, 0); - - document_meta_free(queryMeta); - - if(documents) - { - print_documents(documents); - request_return_free(documents); - } - else - { - puts("Could not find any documents that matched query"); - } - puts(""); -} - -static void fill_meta_by_doi(void) -{ - printf("Starting %s\n", __func__); - DocumentMeta* meta = sci_find_by_doi("10.1002/ange.19410544309", 0); - - if(meta) - { - char* documentString = document_meta_get_string(meta); - printf("Found document for 10.1002/ange.19410544309:\n%s", documentString); - free(documentString); - printf("Trying grab scihub\n"); - meta->backendId = 0; - PdfData* pdfData = sci_get_document_pdf_data(meta); - if(pdfData) - { - puts("Found pdf for 10.1002/ange.19410544309"); - sci_save_pdf_to_file(pdfData, "out.pdf"); - pdf_data_free(pdfData); - } - else - { - puts("unable to grab pdf"); - } - } - else - { - puts("Could not find any documents that matched doi"); - } - - document_meta_free(meta); -} - -int main(int argc, char** argv) -{ - const char* configFileName = NULL; - if(argc > 1) - { - configFileName = argv[1]; - printf("%s using config file %s\n", __func__, configFileName); - } - - sci_log_set_verbosity(LL_DEBUG); - - if(!sci_paper_init(configFileName, NULL, 0)) - { - printf("Coult not init libscipaper"); - return 1; - } - - puts("Backends available:"); - const BackendInfo** backends = sci_get_all_backends(); - for(size_t i = 0; backends[i]; ++i) - { - char* cap = 
capability_flags_get_str(backends[i]->capabilities);
-		printf("%zu:\t%s\tCapabilities: %s\n", i, backends[i]->name, cap);
-		free(cap);
-	}
-
-	//search_wallauer();
-	fill_meta_by_doi();
-	//search_and_grab_wallauer_via_core();
-
-	sci_paper_exit();
-	return 0;
-}
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..10c13d69b2881c531deefb18d9844398ffed01e2
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,207 @@
+/**
+* libscipaper
+* Copyright (C) 2023 Carl Klemm
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* version 3 as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the
+* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+* Boston, MA 02110-1301, USA.
+*/
+
+#include <iostream>
+#include <scipaper/scipaper.h>
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <filesystem>
+#include <string>
+#include <system_error>
+
+#include "log.h"
+#include "options.h"
+
+static constexpr size_t resultsPerPage = 200;
+
+/**
+ * @brief Queries scipaper for documents matching meta and saves every result into outDir.
+ *
+ * Results are fetched page by page; a page that fails to load is retried once and then skipped.
+ * Every result is stored as <index>.json, optionally accompanied by <index>.pdf.
+ *
+ * @param meta query passed to sci_fill_meta()
+ * @param dryRun if true only the number of results is reported and nothing is saved
+ * @param savePdf also try to save the pdf of every result
+ * @param saveText also try to fetch the full text and pass it to document_meta_save()
+ * @param outDir directory the files are written to
+ * @param maxCount maximum number of results to process, 0 disables the limit
+ * @return false if the initial query returned nothing, true otherwise
+ */
+bool grabPapers(const DocumentMeta* meta, bool dryRun, bool savePdf, bool saveText, const std::filesystem::path& outDir, size_t maxCount)
+{
+	Log(Log::INFO)<<"Downloading results";
+	// a limit of 0 means "no limit", so it must not be used as the size of the first page
+	const size_t firstPageSize = maxCount > 0 ? std::min(maxCount, resultsPerPage) : resultsPerPage;
+	RequestReturn* req = sci_fill_meta(meta, nullptr, firstPageSize, 0);
+	if(!req)
+	{
+		Log(Log::WARN)<<"The backend found no results for your query";
+		return false;
+	}
+
+	const size_t pages = req->totalCount/resultsPerPage;
+	const size_t totalCount = req->totalCount;
+
+	Log(Log::INFO)<<"Got "<<totalCount<<" results in "<<pages<<" pages";
+
+	if(dryRun)
+	{
+		request_return_free(req);
+		return true;
+	}
+
+	bool retried = false;
+	size_t processed = 0;
+	for(size_t page = 0; page <= pages; ++page)
+	{
+		if(page != 0)
+			req = sci_fill_meta(meta, nullptr, resultsPerPage, page);
+		if(!req)
+		{
+			// retry a failed page once, then skip it; the flag is cleared again so the next page also gets its retry
+			if(!retried)
+				--page;
+			retried = !retried;
+			continue;
+		}
+		retried = false;
+
+		const size_t target = maxCount > 0 ? std::min(maxCount, req->totalCount) : req->totalCount;
+		Log(Log::INFO)<<"Processing page "<<page<<": "<<processed<<" of "<<target<<
+			", got "<<req->count<<" results this page";
+		for(size_t i = 0; i < req->count; ++i)
+		{
+			if(req->documents[i])
+			{
+				const std::string index = std::to_string(page*resultsPerPage+i);
+				std::filesystem::path jsonpath = outDir/(index + ".json");
+
+				if(savePdf)
+				{
+					std::filesystem::path pdfpath = outDir/(index + ".pdf");
+					bool ret = sci_save_document_to_file(req->documents[i], pdfpath.c_str());
+					if(!ret)
+						Log(Log::WARN)<<"Could not get pdf for document "<<jsonpath;
+				}
+
+				char* text = nullptr;
+				if(saveText)
+				{
+					text = sci_get_document_text(req->documents[i]);
+					if(!text)
+						Log(Log::WARN)<<"Could not get text for document "<<jsonpath;
+				}
+
+				bool ret = document_meta_save(jsonpath.c_str(), req->documents[i], text);
+				if(!ret)
+					Log(Log::WARN)<<"Could not save document metadata"<<jsonpath;
+				// the buffer returned by sci_get_document_text() belongs to the caller; free(nullptr) is a no-op
+				free(text);
+			}
+			++processed;
+			if(maxCount > 0 && processed >= maxCount)
+				break;
+		}
+		request_return_free(req);
+		if(maxCount > 0 && processed >= maxCount)
+			break;
+	}
+	return true;
+}
+
+/**
+ * @brief Makes sure outDir exists, creating the last path component if necessary.
+ *
+ * @param outDir directory to check
+ * @return true if the directory exists afterwards, false (after printing to stderr) otherwise
+ */
+bool checkDir(const std::filesystem::path& outDir)
+{
+	// the error_code overloads are used so that a failure is reported here instead of thrown
+	std::error_code ec;
+	if(std::filesystem::is_directory(outDir, ec))
+		return true;
+
+	if(!std::filesystem::create_directory(outDir, ec) || ec)
+	{
+		std::cerr<<outDir<<" dose not exist and can not be created\n";
+		return false;
+	}
+	return true;
+}
+
+/**
+ * @brief Entry point: parses the command line, builds the query and downloads the results.
+ *
+ * @return 0 on success, 1 if scipaper, the output directory or the requested backend is unusable or nothing was found
+ */
+int main(int argc, char** argv)
+{
+	Log::level = Log::INFO;
+	Config config;
+	argp_parse(&argp, argc, argv, 0, 0, &config);
+
+	if(Log::level == Log::DEBUG)
+		sci_log_set_verbosity(LL_DEBUG);
+
+	if(!sci_paper_init(nullptr, nullptr, 0))
+	{
+		Log(Log::ERROR)<<"could not init scipaper";
+		return 1;
+	}
+
+	bool ret = checkDir(config.outDir);
+	if(!ret)
+	{
+		sci_paper_exit();
+		return 1;
+	}
+
+	DocumentMeta queryMeta = {
+		.doi = const_cast<char*>(config.doi.empty() ? nullptr : config.doi.c_str()),
+		.title = const_cast<char*>(config.title.empty() ? nullptr : config.title.c_str()),
+		.journal = const_cast<char*>(config.journal.empty() ? nullptr : config.journal.c_str()),
+		.keywords = const_cast<char*>(config.keywords.empty() ? nullptr : config.keywords.c_str()),
+		.abstract = const_cast<char*>(config.abstract.empty() ? nullptr : config.abstract.c_str()),
+		.searchText = const_cast<char*>(config.text.empty() ? nullptr : config.text.c_str()),
+		.hasFullText = true
+	};
+
+	if(!config.backend.empty())
+	{
+		// sci_fill_meta() treats backendId 0 as "any backend", so 0 doubles as the not-found value here
+		queryMeta.backendId = sci_backend_get_id_by_name(config.backend.c_str());
+		if(queryMeta.backendId == 0)
+		{
+			Log(Log::ERROR)<<"backend "<<config.backend<<" is not available";
+			sci_paper_exit();
+			return 1;
+		}
+	}
+
+	size_t length;
+	char* json = document_meta_get_json(&queryMeta, nullptr, &length);
+	Log(Log::DEBUG)<<"Using document meta: "<<(json ? json : "(null)");
+	free(json);
+	ret = grabPapers(&queryMeta, config.dryRun, config.savePdf, config.fullText, config.outDir, config.maxNumber);
+	sci_paper_exit();
+	return ret ? 0 : 1;
+}
diff --git a/src/modules/core.c b/src/modules/core.c
index a186793a5c83b92de81de13942267c417886a308..47d53e2ab21f85b0796511780c4ba1cc628819d9 100644
--- a/src/modules/core.c
+++ b/src/modules/core.c
@@ -235,6 +235,12 @@ static RequestReturn* core_fill_meta_impl(int *code, const DocumentMeta* meta, s
 		}
 		g_free(tokens);
 	}
+	if(meta->abstract)
+	{
+		g_string_append(searchString, "abstract:\"");
+		g_string_append(searchString, meta->abstract);
+		g_string_append(searchString, "\"+");
+	}
 	if(meta->searchText)
 	{
 		g_string_append_c(searchString, '\"');
@@ -315,7 +321,7 @@ static RequestReturn* core_fill_meta(const DocumentMeta* meta, size_t maxCount,
 		return NULL;
 	}
 
-	if(meta->author || meta->title || meta->keywords || meta->searchText)
+	if(meta->author || meta->title || meta->keywords || meta->searchText || meta->abstract)
 	{
 		int code = -1;
 		for(int i = 0; i < priv->retry && code != 0; ++i)
@@ -325,6 +331,10 @@ static RequestReturn* core_fill_meta(const DocumentMeta* meta, size_t maxCount,
 			results = core_fill_meta_impl(&code, meta, maxCount, page, priv);
 		}
 	}
+	else
+	{
+		sci_module_log(LL_DEBUG, "Can not fill meta that dose not contain author, title, keywords, abstract or searchText");
+	}
 
 	return results;
 }
diff --git a/src/options.h b/src/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2db02f264ddd3043b1c80d78afb92bd917d6f23
--- /dev/null
+++ b/src/options.h
@@ -0,0 +1,155 @@
+/**
+* papergrabber
+* Copyright (C) 2023 Carl Klemm
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* version 3 as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the
+* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+* Boston, MA 02110-1301, USA.
+*/
+
+#pragma once
+#include <cerrno>
+#include <string>
+#include <vector>
+#include <argp.h>
+#include <iostream>
+#include <filesystem>
+#include <stdexcept>
+#include "log.h"
+
+const char *argp_program_version = "1.0";
+const char *argp_program_bug_address = "<carl@uvos.xyz>";
+static char doc[] = "Application that grabs text or pdf files for documents from online resources using libscipaper";
+static char args_doc[] = "";
+
+// Command line options; an entry takes an argument only if its third field is non-null.
+static struct argp_option options[] =
+{
+	{"verbose", 'v', 0, 0, "Show debug messages" },
+	{"question", 'q', "[FILE]", 0, "Question you wan the system to awnser" },
+	{"key-words", 'k', "[FILE]", 0, "Search in key words" },
+	{"title", 't', "[STRING]", 0, "Search in title"},
+	{"journal", 'j', "[STRING]", 0, "Search in journal"},
+	// earlier misspelling of --journal, kept working as a hidden alias
+	{"jornal", 0, 0, OPTION_ALIAS | OPTION_HIDDEN, 0},
+	{"abstract", 'a', "[STRING]", 0, "Search in abstract"},
+	{"text", 'e', "[STRING]", 0, "Freeform text search"},
+	{"doi", 'i', "[STRING]", 0, "Search for a specific doi" },
+	{"dry-run", 'd', 0, 0, "Just show how manny results there are"},
+	{"out-dir", 'o', "[DIRECTORY]", 0, "Place to save output" },
+	{"limit", 'l', "[NUMBER]", 0, "Maximum number of results to process" },
+	{"pdf", 'p', 0, 0, "Save pdf"},
+	{"full-text", 'f', 0, 0, "Save full text"},
+	{"backend", 'b', "[STRING]", 0, "Ask scipaper to use a specific backend"},
+	{ 0 }
+};
+
+/// Values collected from the command line by parse_opt().
+struct Config
+{
+	std::string keywords;
+	std::string title;
+	std::string journal;
+	std::string abstract;
+	std::string text;
+	std::string question;
+	std::string doi;
+	std::string backend;
+	std::filesystem::path outDir = "./out";
+	size_t maxNumber = 100; // 0 disables the limit
+	bool dryRun = false;
+	bool fullText = false;
+	bool savePdf = false;
+};
+
+/**
+ * @brief argp parser callback that stores each recognised option in the Config passed as input.
+ *
+ * @param key short option character of the option being parsed
+ * @param arg argument of the option, null for options that take none
+ * @param state argp parser state; state->input points to the Config to fill
+ * @return 0 if the option was handled, EINVAL for a malformed limit, ARGP_ERR_UNKNOWN otherwise
+ */
+static error_t parse_opt (int key, char *arg, struct argp_state *state)
+{
+	Config *config = reinterpret_cast<Config*>(state->input);
+
+	switch (key)
+	{
+	case 'v':
+		Log::level = Log::DEBUG;
+		break;
+	case 'q':
+		config->question.assign(arg);
+		break;
+	case 'k':
+		config->keywords.assign(arg);
+		break;
+	case 't':
+		config->title.assign(arg);
+		break;
+	case 'o':
+		config->outDir.assign(arg);
+		break;
+	case 'j':
+		config->journal.assign(arg);
+		break;
+	case 'a':
+		config->abstract.assign(arg);
+		break;
+	case 'e':
+		config->text.assign(arg);
+		break;
+	case 'd':
+		config->dryRun = true;
+		break;
+	case 'l':
+	{
+		// std::stoll signals garbage and overflow with exceptions, which must not escape into argp's C code
+		long long limit = -1;
+		size_t used = 0;
+		try
+		{
+			limit = std::stoll(arg, &used);
+		}
+		catch(const std::logic_error&)
+		{
+			// limit keeps its invalid initial value and is rejected below
+		}
+		if(limit < 0 || arg[used] != '\0')
+		{
+			argp_error(state, "invalid limit: %s", arg);
+			return EINVAL;
+		}
+		config->maxNumber = static_cast<size_t>(limit);
+		break;
+	}
+	case 'b':
+		config->backend.assign(arg);
+		break;
+	case 'i':
+		config->doi.assign(arg);
+		break;
+	case 'f':
+		config->fullText = true;
+		break;
+	case 'p':
+		config->savePdf = true;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static struct argp argp = {options, parse_opt, args_doc, doc};
diff --git a/src/sci-backend.c b/src/sci-backend.c
index b99ceaa70533e4348bbcbdf4e297506b3dea6296..465796896600f14c6c91f1966d54f1d58ca4f4df 100644
--- a/src/sci-backend.c
+++ b/src/sci-backend.c
@@ -213,6 +213,7 @@ RequestReturn* sci_fill_meta(const DocumentMeta* meta, const FillReqest* fill, s
 		struct SciBackend* backend = element->data;
 		if(backend->fill_meta && (meta->backendId == backend->id || meta->backendId == 0))
 		{
+			sci_log(LL_DEBUG, "%s: Trying to fill using %s", __func__, backend->backend_info->name);
 			RequestReturn* newMetas = backend->fill_meta(meta, maxCount, page, backend->user_data);
 			if(newMetas)
 			{
diff --git a/src/sci-conf.c b/src/sci-conf.c
index b2c5583d21002b9220ea4431c024ef219a0032e2..3dc1b54c6b167ef532494a62fe4e031461f2fafd 100644
--- a/src/sci-conf.c
+++ b/src/sci-conf.c
@@ -324,10 +324,14 @@ static bool sci_conf_is_ini_file(const char *filename)
 bool
sci_conf_init(const char* fileName, const char* data, size_t length) { sci_conf_file_count = 1; + char* home = getenv("HOME"); + if(fileName) ++sci_conf_file_count; if(data) ++sci_conf_file_count; + if(home) + ++sci_conf_file_count; size_t index = 0; conf_files = calloc(sci_conf_file_count, sizeof(*conf_files)); @@ -349,6 +353,24 @@ bool sci_conf_init(const char* fileName, const char* data, size_t length) ++index; } + if(home) + { + conf_files[index].filename = g_strdup(G_STRINGIFY(SCI_SYSCONF_INI)); + conf_files[index].path = g_strconcat(home, "/", G_STRINGIFY(SCI_USERCONF_DIR), "/", G_STRINGIFY(SCI_SYSCONF_INI), NULL); + gpointer conf_file = sci_conf_read_conf_file(conf_files[index].path); + if(!conf_file) + { + g_free(conf_files[index].filename); + g_free(conf_files[index].path); + --sci_conf_file_count; + } + else + { + conf_files[index].keyfile = conf_file; + ++index; + } + } + if(fileName) { if(sci_conf_is_ini_file(fileName)) @@ -395,7 +417,7 @@ bool sci_conf_init(const char* fileName, const char* data, size_t length) return FALSE; for (size_t i = 0; i < sci_conf_file_count; ++i) - sci_log(LL_DEBUG, "sci-conf: using conf file %lu: %s", (unsigned long)i, conf_files[i].filename); + sci_log(LL_DEBUG, "sci-conf: using conf file %lu: %s", (unsigned long)i, conf_files[i].path); return TRUE; } diff --git a/src/sci-modules.c b/src/sci-modules.c index 291942cb471ab62b62566757f4c328968c5b830d..0cf33c1752f902fe0ecccf375e7c3debb0c8fff7 100644 --- a/src/sci-modules.c +++ b/src/sci-modules.c @@ -69,7 +69,7 @@ static void sci_modules_load(gchar **modlist) { struct sci_module *module = g_malloc(sizeof(*module)); module->name = g_strdup(modlist[i]); - gchar *tmp = g_module_build_path(path, modlist[i]); + gchar *tmp = g_strconcat(path, "/", modlist[i], NULL);//g_module_build_path(path, modlist[i]); sci_log(LL_DEBUG, "Loading module: %s from %s", modlist[i], path);