diff --git a/.gitignore b/.gitignore
index 7fd4f68..f1976ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,27 +1,22 @@
-# ignore all files and directories
-*
-# allow git to enter directories
-!*/
-venv/
-core/crowdsec/config
-database
-data
-db
-docs
-examples
-searx
-tests
-utils
-stockfill
-shift-recorder
-# keep essential project files
-!.gitignore
-!.gitattributes
+#*
+#!**/
+#!.gitignore
+#!**/Dockerfile
+#!docker-compose.yml
+**/data/
+**/db/
+**/database/
+apps/nextcloud/config/
+core/crowdsec/config/
+**/.env
+
+apps/stockfill/
+apps/shift-recorder/
+
+!apps/searxng/
+apps/searxng/*
+
+!apps/searxng/Dockerfile
+!apps/searxng/docker-compose.yml
+venv/
 
-# allow YAMLs, shell scripts, and others
-!*.yml
-!*.yaml
-!*.sh
-!*.py
-!*.Dockerfile
-!Dockerfile
diff --git a/.gitignore.gpt b/.gitignore.gpt
new file mode 100644
index 0000000..a2b5e68
--- /dev/null
+++ b/.gitignore.gpt
@@ -0,0 +1,73 @@
+# Git internals accidentally nested in subprojects
+# Gramps persistent data and postgres cluster
+apps/gramps/data/
+apps/gramps/db/
+
+# Nextcloud persistent data
+apps/nextcloud/data/
+apps/nextcloud/database/
+apps/nextcloud/config/
+
+# Passbolt secrets and database
+apps/passbolt/data/
+
+# Android / Gradle build artifacts
+apps/shift-recorder/.gradle/
+apps/shift-recorder/app/build/
+apps/shift-recorder/build/
+apps/shift-recorder/app-release-*.apk
+apps/shift-recorder/app-release-*.aab
+apps/shift-recorder/*.idsig
+apps/shift-recorder/android.keystore
+apps/shift-recorder/manifest-checksum.txt
+
+apps/stockfill/.gradle/
+apps/stockfill/app/build/
+apps/stockfill/build/
+apps/stockfill/app-release-*.apk
+apps/stockfill/app-release-*.aab
+apps/stockfill/*.idsig
+apps/stockfill/android.keystore
+apps/stockfill/manifest-checksum.txt
+
+# Node / frontend artifacts
+**/node_modules/
+**/dist/
+**/.vite/
+**/coverage/
+**/playwright-report/
+**/test-results/
+
+# Python artifacts
+**/__pycache__/
+**/*.pyc
+
+# IDE files
+**/.vscode/
+apps/shift-recorder/
+#!apps/shift-recorder/.vscode/launch.json
+apps/stockfill/
+#!apps/stockfill/.vscode/launch.json
+#!apps/stockfill/.vscode/settings.json
+
+# Traefik data
+core/traefik/data/
+
+# Authelia sqlite database and notifications
+core/authelia/data/
+
+# Crowdsec generated state
+core/crowdsec/data/
+core/crowdsec/logs/
+core/crowdsec/config
+# Archived service data
+archive/esphome/data/
+
+monitoring/gotify/data/
+monitoring/grafana/data/
+monitoring/portainer/data/
+apps/gitea/data/
+apps/searxng/
+monitoring/prometheus/data/
+monitoring/uptime-kuma/data/
+# Keep compose files, Dockerfiles, configs, scripts, and documentation tracked
diff --git a/.gitignore.old b/.gitignore.old
new file mode 100644
index 0000000..7fd4f68
--- /dev/null
+++ b/.gitignore.old
@@ -0,0 +1,27 @@
+# ignore all files and directories
+*
+# allow git to enter directories
+!*/
+venv/
+core/crowdsec/config
+database
+data
+db
+docs
+examples
+searx
+tests
+utils
+stockfill
+shift-recorder
+# keep essential project files
+!.gitignore
+!.gitattributes
+
+# allow YAMLs, shell scripts, and others
+!*.yml
+!*.yaml
+!*.sh
+!*.py
+!*.Dockerfile
+!Dockerfile
diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-environment-setup-in-conftest.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-environment-setup-in-conftest.py
deleted file mode 100644
index 17ffce6..0000000
---
a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-environment-setup-in-conftest.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 8af5ca464225c888f5438a0fd226937e2ccabca4 Gitea 1757451832 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-geocode.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-geocode.py deleted file mode 100644 index cd73075..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-geocode.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 bb76e891d59a88beeb4f5b233cbecfd94a8f0cae Gitea 1756461833 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-route_metrics.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-route_metrics.py deleted file mode 100644 index 2621d06..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-route_metrics.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 f5998826f6b9ed820f5a97a67ce11402e72fd2f3 Gitea 1756461833 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-tracking-simulator.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-tracking-simulator.py deleted file mode 100644 index bf22e80..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-tracking-simulator.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 791c61ee8129113a02d67ffa171ed48843cdf025 Gitea 1756815232 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/extend-sqlite-tuning-in-database.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/extend-sqlite-tuning-in-database.py deleted file mode 100644 index b5ed21e..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/extend-sqlite-tuning-in-database.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 e76063242b47225e6614f17749b1bf2ba0b70ac9 Gitea 1757407432 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/fix-route-handling-in-routing.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/fix-route-handling-in-routing.py deleted file mode 100644 index 6a1e1fb..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/fix-route-handling-in-routing.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 1c93f2ab9ceef7601c5db93de0ddb057aef5b4b1 Gitea 1756433632 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/handle-api-response-errors-in-routing.py 
b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/handle-api-response-errors-in-routing.py deleted file mode 100644 index fa40dd5..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/handle-api-response-errors-in-routing.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 0404d47f0ba21269865f307f3fd53e746ff155a4 Gitea 1756436033 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/refactor-database-path-handling-in-database.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/refactor-database-path-handling-in-database.py deleted file mode 100644 index 812244d..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/refactor-database-path-handling-in-database.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 80111cd7579abc6319f5d357da060db8186babaf Gitea 1758786607 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-fcm-message-construction-in-notifications.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-fcm-message-construction-in-notifications.py deleted file mode 100644 index eac11ef..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-fcm-message-construction-in-notifications.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 0fea5ebd8be8d93f95630bdc5cc9ecc0b0bbac43 Gitea 1756949032 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-role-check-in-ws.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-role-check-in-ws.py deleted file mode 100644 index 35b4da7..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-role-check-in-ws.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 7b87a0e2a6c03e5344da2fe6a391c1f1fb269b5c Gitea 1756851832 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-user-seed-in-database.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-user-seed-in-database.py deleted file mode 100644 index 5623d8e..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-user-seed-in-database.py +++ /dev/null @@ -1 +0,0 @@ -0000000000000000000000000000000000000000 bde15e4b736be753b1272a3277f528d7eb75d371 Gitea 1756508033 +0000 fetch --prune --tags origin: storing head diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-environment-setup-in-conftest.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-environment-setup-in-conftest.py deleted file mode 100644 index 5b90c94..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-environment-setup-in-conftest.py +++ /dev/null @@ -1 +0,0 @@ -8af5ca464225c888f5438a0fd226937e2ccabca4 diff --git 
a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-geocode.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-geocode.py deleted file mode 100644 index 7266f8f..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-geocode.py +++ /dev/null @@ -1 +0,0 @@ -bb76e891d59a88beeb4f5b233cbecfd94a8f0cae diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-route_metrics.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-route_metrics.py deleted file mode 100644 index 6df0506..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-route_metrics.py +++ /dev/null @@ -1 +0,0 @@ -f5998826f6b9ed820f5a97a67ce11402e72fd2f3 diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-tracking-simulator.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-tracking-simulator.py deleted file mode 100644 index a2076f0..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-tracking-simulator.py +++ /dev/null @@ -1 +0,0 @@ -791c61ee8129113a02d67ffa171ed48843cdf025 diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/extend-sqlite-tuning-in-database.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/extend-sqlite-tuning-in-database.py deleted file mode 100644 index cd8c913..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/extend-sqlite-tuning-in-database.py +++ /dev/null @@ -1 +0,0 @@ -e76063242b47225e6614f17749b1bf2ba0b70ac9 diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/fix-route-handling-in-routing.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/fix-route-handling-in-routing.py deleted file mode 100644 index ef73ce7..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/fix-route-handling-in-routing.py +++ /dev/null @@ -1 +0,0 @@ -1c93f2ab9ceef7601c5db93de0ddb057aef5b4b1 diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/handle-api-response-errors-in-routing.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/handle-api-response-errors-in-routing.py deleted file mode 100644 index 0186adc..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/handle-api-response-errors-in-routing.py +++ /dev/null @@ -1 +0,0 @@ -0404d47f0ba21269865f307f3fd53e746ff155a4 diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/refactor-database-path-handling-in-database.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/refactor-database-path-handling-in-database.py deleted file mode 100644 index 4b75700..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/refactor-database-path-handling-in-database.py +++ /dev/null @@ -1 +0,0 @@ -80111cd7579abc6319f5d357da060db8186babaf diff --git 
a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-fcm-message-construction-in-notifications.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-fcm-message-construction-in-notifications.py deleted file mode 100644 index 7fdace2..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-fcm-message-construction-in-notifications.py +++ /dev/null @@ -1 +0,0 @@ -0fea5ebd8be8d93f95630bdc5cc9ecc0b0bbac43 diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-role-check-in-ws.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-role-check-in-ws.py deleted file mode 100644 index dd3247b..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-role-check-in-ws.py +++ /dev/null @@ -1 +0,0 @@ -7b87a0e2a6c03e5344da2fe6a391c1f1fb269b5c diff --git a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-user-seed-in-database.py b/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-user-seed-in-database.py deleted file mode 100644 index 081ed4c..0000000 --- a/apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-user-seed-in-database.py +++ /dev/null @@ -1 +0,0 @@ -bde15e4b736be753b1272a3277f528d7eb75d371 diff --git a/apps/searxng/dockerfiles/docker-entrypoint.sh b/apps/searxng/dockerfiles/docker-entrypoint.sh deleted file mode 100755 index 332d5c2..0000000 --- a/apps/searxng/dockerfiles/docker-entrypoint.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/sh - -help() { - cat </dev/null -} - -SEARXNG_VERSION="$(get_searxng_version)" -export SEARXNG_VERSION -echo "SearXNG version ${SEARXNG_VERSION}" - -# helpers to update the configuration files -patch_uwsgi_settings() { - CONF="$1" - - # update uwsg.ini - sed -i \ - -e "s|workers = .*|workers = ${UWSGI_WORKERS:-%k}|g" \ - -e "s|threads = .*|threads = ${UWSGI_THREADS:-4}|g" \ - "${CONF}" -} - -patch_searxng_settings() { - CONF="$1" - - # Make sure that there is trailing slash at the end of BASE_URL - # see https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Shell-Parameter-Expansion - export BASE_URL="${BASE_URL%/}/" - - # update settings.yml - sed -i \ - -e "s|base_url: false|base_url: ${BASE_URL}|g" \ - -e "s/instance_name: \"SearXNG\"/instance_name: \"${INSTANCE_NAME}\"/g" \ - -e "s/autocomplete: \"\"/autocomplete: \"${AUTOCOMPLETE}\"/g" \ - -e "s/ultrasecretkey/$(openssl rand -hex 32)/g" \ - "${CONF}" - - # Morty configuration - - if [ -n "${MORTY_KEY}" ] && [ -n "${MORTY_URL}" ]; then - sed -i -e "s/image_proxy: false/image_proxy: true/g" \ - "${CONF}" - cat >> "${CONF}" <<-EOF - -# Morty configuration -result_proxy: - url: ${MORTY_URL} - key: !!binary "${MORTY_KEY}" -EOF - fi -} - -update_conf() { - FORCE_CONF_UPDATE=$1 - CONF="$2" - NEW_CONF="${2}.new" - OLD_CONF="${2}.old" - REF_CONF="$3" - PATCH_REF_CONF="$4" - - if [ -f "${CONF}" ]; then - if [ "${REF_CONF}" -nt "${CONF}" ]; then - # There is a new version - if [ "$FORCE_CONF_UPDATE" -ne 0 ]; then - # Replace the current configuration - printf '⚠️ Automatically update %s to the new version\n' "${CONF}" - if [ ! 
-f "${OLD_CONF}" ]; then - printf 'The previous configuration is saved to %s\n' "${OLD_CONF}" - mv "${CONF}" "${OLD_CONF}" - fi - cp "${REF_CONF}" "${CONF}" - $PATCH_REF_CONF "${CONF}" - else - # Keep the current configuration - printf '⚠️ Check new version %s to make sure SearXNG is working properly\n' "${NEW_CONF}" - cp "${REF_CONF}" "${NEW_CONF}" - $PATCH_REF_CONF "${NEW_CONF}" - fi - else - printf 'Use existing %s\n' "${CONF}" - fi - else - printf 'Create %s\n' "${CONF}" - cp "${REF_CONF}" "${CONF}" - $PATCH_REF_CONF "${CONF}" - fi -} - -# searx compatibility: copy /etc/searx/* to /etc/searxng/* -SEARX_CONF=0 -if [ -f "/etc/searx/settings.yml" ]; then - if [ ! -f "${SEARXNG_SETTINGS_PATH}" ]; then - printf '⚠️ /etc/searx/settings.yml is copied to /etc/searxng\n' - cp "/etc/searx/settings.yml" "${SEARXNG_SETTINGS_PATH}" - fi - SEARX_CONF=1 -fi -if [ -f "/etc/searx/uwsgi.ini" ]; then - printf '⚠️ /etc/searx/uwsgi.ini is ignored. Use the volume /etc/searxng\n' - SEARX_CONF=1 -fi -if [ "$SEARX_CONF" -eq "1" ]; then - printf '⚠️ The deprecated volume /etc/searx is mounted. Please update your configuration to use /etc/searxng ⚠️\n' - cat << EOF > /etc/searx/deprecated_volume_read_me.txt -This Docker image uses the volume /etc/searxng -Update your configuration: -* remove uwsgi.ini (or very carefully update your existing uwsgi.ini using https://github.com/searxng/searxng/blob/master/dockerfiles/uwsgi.ini ) -* mount /etc/searxng instead of /etc/searx -EOF -fi -# end of searx compatibility - -# make sure there are uwsgi settings -update_conf "${FORCE_CONF_UPDATE}" "${UWSGI_SETTINGS_PATH}" "/usr/local/searxng/dockerfiles/uwsgi.ini" "patch_uwsgi_settings" - -# make sure there are searxng settings -update_conf "${FORCE_CONF_UPDATE}" "${SEARXNG_SETTINGS_PATH}" "/usr/local/searxng/searx/settings.yml" "patch_searxng_settings" - -# dry run (to update configuration files, then inspect them) -if [ $DRY_RUN -eq 1 ]; then - printf 'Dry run\n' - exit -fi - -unset MORTY_KEY - -# Start uwsgi -printf 'Listen on %s\n' "${BIND_ADDRESS}" -exec su-exec searxng:searxng uwsgi --master --http-socket "${BIND_ADDRESS}" "${UWSGI_SETTINGS_PATH}" diff --git a/apps/searxng/docs/conf.py b/apps/searxng/docs/conf.py deleted file mode 100644 index aa4905e..0000000 --- a/apps/searxng/docs/conf.py +++ /dev/null @@ -1,205 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-License-Identifier: AGPL-3.0-or-later - -import sys, os -from pallets_sphinx_themes import ProjectLink - -from searx import get_setting -from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH - -# Project -------------------------------------------------------------- - -project = 'SearXNG' -copyright = 'SearXNG team' -author = 'SearXNG team' -release, version = VERSION_STRING, VERSION_STRING - -SEARXNG_URL = get_setting('server.base_url') or 'https://example.org/searxng' -ISSUE_URL = get_setting('brand.issue_url') -DOCS_URL = get_setting('brand.docs_url') -PUBLIC_INSTANCES = get_setting('brand.public_instances') -PRIVACYPOLICY_URL = get_setting('general.privacypolicy_url') -CONTACT_URL = get_setting('general.contact_url') -WIKI_URL = get_setting('brand.wiki_url') - -# hint: sphinx.ext.viewcode won't highlight when 'highlight_language' [1] is set -# to string 'none' [2] -# -# [1] https://www.sphinx-doc.org/en/master/usage/extensions/viewcode.html -# [2] https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-highlight_language - -highlight_language = 'default' - -# General -------------------------------------------------------------- - -master_doc = 
"index" -source_suffix = '.rst' -numfig = True - -exclude_patterns = ['build-templates/*.rst', 'user/*.md'] - -import searx.engines -import searx.plugins -import searx.webutils - -# import searx.webapp is needed to init the engines & plugins, to init a -# (empty) secret_key is needed. -searx.settings['server']['secret_key'] = '' -import searx.webapp - -searx.engines.load_engines(searx.settings['engines']) - -jinja_contexts = { - 'searx': { - 'engines': searx.engines.engines, - 'plugins': searx.plugins.plugins, - 'version': { - 'node': os.getenv('NODE_MINIMUM_VERSION') - }, - 'enabled_engine_count': sum(not x.disabled for x in searx.engines.engines.values()), - 'categories': searx.engines.categories, - 'categories_as_tabs': {c: searx.engines.categories[c] for c in searx.settings['categories_as_tabs']}, - }, -} -jinja_filters = { - 'group_engines_in_tab': searx.webutils.group_engines_in_tab, -} - -# Let the Jinja template in configured_engines.rst access documented_modules -# to automatically link documentation for modules if it exists. -def setup(app): - ENGINES_DOCNAME = 'user/configured_engines' - - def before_read_docs(app, env, docnames): - assert ENGINES_DOCNAME in docnames - docnames.remove(ENGINES_DOCNAME) - docnames.append(ENGINES_DOCNAME) - # configured_engines must come last so that sphinx already has - # discovered the python module documentations - - def source_read(app, docname, source): - if docname == ENGINES_DOCNAME: - jinja_contexts['searx']['documented_modules'] = app.env.domains['py'].modules - - app.connect('env-before-read-docs', before_read_docs) - app.connect('source-read', source_read) - -# usage:: lorem :patch:`f373169` ipsum -extlinks = {} - -# upstream links -extlinks['wiki'] = ('https://github.com/searxng/searxng/wiki/%s', ' %s') -extlinks['pull'] = ('https://github.com/searxng/searxng/pull/%s', 'PR %s') -extlinks['pull-searx'] = ('https://github.com/searx/searx/pull/%s', 'PR %s') - -# links to custom brand -extlinks['origin'] = (GIT_URL + '/blob/' + GIT_BRANCH + '/%s', 'git://%s') -extlinks['patch'] = (GIT_URL + '/commit/%s', '#%s') -extlinks['docs'] = (DOCS_URL + '/%s', 'docs: %s') -extlinks['pypi'] = ('https://pypi.org/project/%s', 'PyPi: %s') -extlinks['man'] = ('https://manpages.debian.org/jump?q=%s', '%s') -#extlinks['role'] = ( -# 'https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html#role-%s', '') -extlinks['duref'] = ( - 'https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#%s', '%s') -extlinks['durole'] = ( - 'https://docutils.sourceforge.io/docs/ref/rst/roles.html#%s', '%s') -extlinks['dudir'] = ( - 'https://docutils.sourceforge.io/docs/ref/rst/directives.html#%s', '%s') -extlinks['ctan'] = ( - 'https://ctan.org/pkg/%s', 'CTAN: %s') - -extensions = [ - 'sphinx.ext.imgmath', - 'sphinx.ext.extlinks', - 'sphinx.ext.viewcode', - "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", - "pallets_sphinx_themes", - "sphinx_issues", # https://github.com/sloria/sphinx-issues/blob/master/README.rst - "sphinx_jinja", # https://github.com/tardyp/sphinx-jinja - "sphinxcontrib.programoutput", # https://github.com/NextThought/sphinxcontrib-programoutput - 'linuxdoc.kernel_include', # Implementation of the 'kernel-include' reST-directive. - 'linuxdoc.rstFlatTable', # Implementation of the 'flat-table' reST-directive. - 'linuxdoc.kfigure', # Sphinx extension which implements scalable image handling. 
- "sphinx_tabs.tabs", # https://github.com/djungelorm/sphinx-tabs - 'myst_parser', # https://www.sphinx-doc.org/en/master/usage/markdown.html - 'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page -] - -autodoc_default_options = { - 'member-order': 'groupwise', -} - -myst_enable_extensions = [ - "replacements", "smartquotes" -] - -suppress_warnings = ['myst.domains'] - -intersphinx_mapping = { - "python": ("https://docs.python.org/3/", None), - "babel" : ("https://babel.readthedocs.io/en/latest/", None), - "flask": ("https://flask.palletsprojects.com/", None), - "flask_babel": ("https://python-babel.github.io/flask-babel/", None), - # "werkzeug": ("https://werkzeug.palletsprojects.com/", None), - "jinja": ("https://jinja.palletsprojects.com/", None), - "linuxdoc" : ("https://return42.github.io/linuxdoc/", None), - "sphinx" : ("https://www.sphinx-doc.org/en/master/", None), - "redis": ('https://redis.readthedocs.io/en/stable/', None), -} - -issues_github_path = "searxng/searxng" - -# HTML ----------------------------------------------------------------- - -# https://searxng.github.io/searxng --> '/searxng/' -# https://docs.searxng.org --> '/' -notfound_urls_prefix = '/' - -sys.path.append(os.path.abspath('_themes')) -sys.path.insert(0, os.path.abspath("../utils/")) -html_theme_path = ['_themes'] -html_theme = "searxng" - -# sphinx.ext.imgmath setup -html_math_renderer = 'imgmath' -imgmath_image_format = 'svg' -imgmath_font_size = 14 -# sphinx.ext.imgmath setup END - -html_show_sphinx = False -html_theme_options = {"index_sidebar_logo": True} -html_context = {"project_links": [] } -html_context["project_links"].append(ProjectLink("Source", GIT_URL + '/tree/' + GIT_BRANCH)) - -if WIKI_URL: - html_context["project_links"].append(ProjectLink("Wiki", WIKI_URL)) -if PUBLIC_INSTANCES: - html_context["project_links"].append(ProjectLink("Public instances", PUBLIC_INSTANCES)) -if ISSUE_URL: - html_context["project_links"].append(ProjectLink("Issue Tracker", ISSUE_URL)) -if PRIVACYPOLICY_URL: - html_context["project_links"].append(ProjectLink("Privacy Policy", PRIVACYPOLICY_URL)) -if CONTACT_URL: - html_context["project_links"].append(ProjectLink("Contact", CONTACT_URL)) - -html_sidebars = { - "**": [ - "globaltoc.html", - "project.html", - "relations.html", - "searchbox.html", - "sourcelink.html" - ], -} -singlehtml_sidebars = {"index": ["project.html", "localtoc.html"]} -html_logo = "../src/brand/searxng-wordmark.svg" -html_title = "SearXNG Documentation ({})".format(VERSION_STRING) -html_show_sourcelink = True - -# LaTeX ---------------------------------------------------------------- - -latex_documents = [ - (master_doc, "searxng-{}.tex".format(VERSION_STRING), html_title, author, "manual") -] diff --git a/apps/searxng/docs/user/.gitignore b/apps/searxng/docs/user/.gitignore deleted file mode 100644 index 2e1fa2d..0000000 --- a/apps/searxng/docs/user/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.md \ No newline at end of file diff --git a/apps/searxng/examples/basic_engine.py b/apps/searxng/examples/basic_engine.py deleted file mode 100644 index c7d02af..0000000 --- a/apps/searxng/examples/basic_engine.py +++ /dev/null @@ -1,25 +0,0 @@ - -categories = ['general'] # optional - - -def request(query, params): - '''pre-request callback - params: - method : POST/GET - headers : {} - data : {} # if method == POST - url : '' - category: 'search category' - pageno : 1 # number of the requested page - ''' - - params['url'] = 'https://host/%s' % query - - return params - - -def 
response(resp): - '''post-response callback - resp: requests response object - ''' - return [{'url': '', 'title': '', 'content': ''}] diff --git a/apps/searxng/searx/__init__.py b/apps/searxng/searx/__init__.py deleted file mode 100755 index d2d389e..0000000 --- a/apps/searxng/searx/__init__.py +++ /dev/null @@ -1,106 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring - -import sys -import os -from os.path import dirname, abspath - -import logging - -import searx.unixthreadname -import searx.settings_loader -from searx.settings_defaults import settings_set_defaults - - -# Debug -LOG_FORMAT_DEBUG = '%(levelname)-7s %(name)-30.30s: %(message)s' - -# Production -LOG_FORMAT_PROD = '%(asctime)-15s %(levelname)s:%(name)s: %(message)s' -LOG_LEVEL_PROD = logging.WARNING - -searx_dir = abspath(dirname(__file__)) -searx_parent_dir = abspath(dirname(dirname(__file__))) -settings, settings_load_message = searx.settings_loader.load_settings() - -if settings is not None: - settings = settings_set_defaults(settings) - -_unset = object() - - -def get_setting(name, default=_unset): - """Returns the value to which ``name`` point. If there is no such name in the - settings and the ``default`` is unset, a :py:obj:`KeyError` is raised. - - """ - value = settings - for a in name.split('.'): - if isinstance(value, dict): - value = value.get(a, _unset) - else: - value = _unset - - if value is _unset: - if default is _unset: - raise KeyError(name) - value = default - break - - return value - - -def is_color_terminal(): - if os.getenv('TERM') in ('dumb', 'unknown'): - return False - return sys.stdout.isatty() - - -def logging_config_debug(): - try: - import coloredlogs # pylint: disable=import-outside-toplevel - except ImportError: - coloredlogs = None - - log_level = os.environ.get('SEARXNG_DEBUG_LOG_LEVEL', 'DEBUG') - if coloredlogs and is_color_terminal(): - level_styles = { - 'spam': {'color': 'green', 'faint': True}, - 'debug': {}, - 'notice': {'color': 'magenta'}, - 'success': {'bold': True, 'color': 'green'}, - 'info': {'bold': True, 'color': 'cyan'}, - 'warning': {'color': 'yellow'}, - 'error': {'color': 'red'}, - 'critical': {'bold': True, 'color': 'red'}, - } - field_styles = { - 'asctime': {'color': 'green'}, - 'hostname': {'color': 'magenta'}, - 'levelname': {'color': 8}, - 'name': {'color': 8}, - 'programname': {'color': 'cyan'}, - 'username': {'color': 'yellow'}, - } - coloredlogs.install(level=log_level, level_styles=level_styles, field_styles=field_styles, fmt=LOG_FORMAT_DEBUG) - else: - logging.basicConfig(level=logging.getLevelName(log_level), format=LOG_FORMAT_DEBUG) - - -searx_debug = settings['general']['debug'] -if searx_debug: - logging_config_debug() -else: - logging.basicConfig(level=LOG_LEVEL_PROD, format=LOG_FORMAT_PROD) - logging.root.setLevel(level=LOG_LEVEL_PROD) - logging.getLogger('werkzeug').setLevel(level=LOG_LEVEL_PROD) -logger = logging.getLogger('searx') -logger.info(settings_load_message) - -# log max_request_timeout -max_request_timeout = settings['outgoing']['max_request_timeout'] -if max_request_timeout is None: - logger.info('max_request_timeout=%s', repr(max_request_timeout)) -else: - logger.info('max_request_timeout=%i second(s)', max_request_timeout) diff --git a/apps/searxng/searx/answerers/__init__.py b/apps/searxng/searx/answerers/__init__.py deleted file mode 100755 index 8e2b9b3..0000000 --- a/apps/searxng/searx/answerers/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from os import listdir -from os.path 
import realpath, dirname, join, isdir -from searx.utils import load_module -from collections import defaultdict - - -answerers_dir = dirname(realpath(__file__)) - - -def load_answerers(): - answerers = [] - for filename in listdir(answerers_dir): - if not isdir(join(answerers_dir, filename)) or filename.startswith('_'): - continue - module = load_module('answerer.py', join(answerers_dir, filename)) - if not hasattr(module, 'keywords') or not isinstance(module.keywords, tuple) or not len(module.keywords): - exit(2) - answerers.append(module) - return answerers - - -def get_answerers_by_keywords(answerers): - by_keyword = defaultdict(list) - for answerer in answerers: - for keyword in answerer.keywords: - for keyword in answerer.keywords: - by_keyword[keyword].append(answerer.answer) - return by_keyword - - -def ask(query): - results = [] - query_parts = list(filter(None, query.query.split())) - - if not query_parts or query_parts[0] not in answerers_by_keywords: - return results - - for answerer in answerers_by_keywords[query_parts[0]]: - result = answerer(query) - if result: - results.append(result) - return results - - -answerers = load_answerers() -answerers_by_keywords = get_answerers_by_keywords(answerers) diff --git a/apps/searxng/searx/answerers/random/answerer.py b/apps/searxng/searx/answerers/random/answerer.py deleted file mode 100755 index 059dd7c..0000000 --- a/apps/searxng/searx/answerers/random/answerer.py +++ /dev/null @@ -1,70 +0,0 @@ -import hashlib -import random -import string -import uuid -from flask_babel import gettext - -# required answerer attribute -# specifies which search query keywords triggers this answerer -keywords = ('random',) - -random_int_max = 2**31 -random_string_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase - - -def random_characters(): - return [random.choice(random_string_letters) for _ in range(random.randint(8, 32))] - - -def random_string(): - return ''.join(random_characters()) - - -def random_float(): - return str(random.random()) - - -def random_int(): - return str(random.randint(-random_int_max, random_int_max)) - - -def random_sha256(): - m = hashlib.sha256() - m.update(''.join(random_characters()).encode()) - return str(m.hexdigest()) - - -def random_uuid(): - return str(uuid.uuid4()) - - -random_types = { - 'string': random_string, - 'int': random_int, - 'float': random_float, - 'sha256': random_sha256, - 'uuid': random_uuid, -} - - -# required answerer function -# can return a list of results (any result type) for a given query -def answer(query): - parts = query.query.split() - if len(parts) != 2: - return [] - - if parts[1] not in random_types: - return [] - - return [{'answer': random_types[parts[1]]()}] - - -# required answerer function -# returns information about the answerer -def self_info(): - return { - 'name': gettext('Random value generator'), - 'description': gettext('Generate different random values'), - 'examples': ['random {}'.format(x) for x in random_types], - } diff --git a/apps/searxng/searx/answerers/statistics/answerer.py b/apps/searxng/searx/answerers/statistics/answerer.py deleted file mode 100755 index 60f0d30..0000000 --- a/apps/searxng/searx/answerers/statistics/answerer.py +++ /dev/null @@ -1,50 +0,0 @@ -from functools import reduce -from operator import mul - -from flask_babel import gettext - - -keywords = ('min', 'max', 'avg', 'sum', 'prod') - - -# required answerer function -# can return a list of results (any result type) for a given query -def answer(query): - parts = 
query.query.split() - - if len(parts) < 2: - return [] - - try: - args = list(map(float, parts[1:])) - except: - return [] - - func = parts[0] - answer = None - - if func == 'min': - answer = min(args) - elif func == 'max': - answer = max(args) - elif func == 'avg': - answer = sum(args) / len(args) - elif func == 'sum': - answer = sum(args) - elif func == 'prod': - answer = reduce(mul, args, 1) - - if answer is None: - return [] - - return [{'answer': str(answer)}] - - -# required answerer function -# returns information about the answerer -def self_info(): - return { - 'name': gettext('Statistics functions'), - 'description': gettext('Compute {functions} of the arguments').format(functions='/'.join(keywords)), - 'examples': ['avg 123 548 2.04 24.2'], - } diff --git a/apps/searxng/searx/autocomplete.py b/apps/searxng/searx/autocomplete.py deleted file mode 100755 index ad9903f..0000000 --- a/apps/searxng/searx/autocomplete.py +++ /dev/null @@ -1,228 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This module implements functions needed for the autocompleter. - -""" -# pylint: disable=use-dict-literal - -import json -from urllib.parse import urlencode - -import lxml -from httpx import HTTPError - -from searx import settings -from searx.engines import ( - engines, - google, -) -from searx.network import get as http_get -from searx.exceptions import SearxEngineResponseException - - -def get(*args, **kwargs): - if 'timeout' not in kwargs: - kwargs['timeout'] = settings['outgoing']['request_timeout'] - kwargs['raise_for_httperror'] = True - return http_get(*args, **kwargs) - - -def brave(query, _lang): - # brave search autocompleter - url = 'https://search.brave.com/api/suggest?' - url += urlencode({'q': query}) - country = 'all' - # if lang in _brave: - # country = lang - kwargs = {'cookies': {'country': country}} - resp = get(url, **kwargs) - - results = [] - - if resp.ok: - data = resp.json() - for item in data[1]: - results.append(item) - return results - - -def dbpedia(query, _lang): - # dbpedia autocompleter, no HTTPS - autocomplete_url = 'https://lookup.dbpedia.org/api/search.asmx/KeywordSearch?' - - response = get(autocomplete_url + urlencode(dict(QueryString=query))) - - results = [] - - if response.ok: - dom = lxml.etree.fromstring(response.content) - results = dom.xpath('//Result/Label//text()') - - return results - - -def duckduckgo(query, sxng_locale): - """Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages""" - - traits = engines['duckduckgo'].traits - args = { - 'q': query, - 'kl': traits.get_region(sxng_locale, traits.all_locale), - } - - url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args) - resp = get(url) - - ret_val = [] - if resp.ok: - j = resp.json() - if len(j) > 1: - ret_val = j[1] - return ret_val - - -def google_complete(query, sxng_locale): - """Autocomplete from Google. 
Supports Google's languages and subdomains - (:py:obj:`searx.engines.google.get_google_info`) by using the async REST - API:: - - https://{subdomain}/complete/search?{args} - - """ - - google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits) - - url = 'https://{subdomain}/complete/search?{args}' - args = urlencode( - { - 'q': query, - 'client': 'gws-wiz', - 'hl': google_info['params']['hl'], - } - ) - results = [] - resp = get(url.format(subdomain=google_info['subdomain'], args=args)) - if resp.ok: - json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1] - data = json.loads(json_txt) - for item in data[0]: - results.append(lxml.html.fromstring(item[0]).text_content()) - return results - - -def seznam(query, _lang): - # seznam search autocompleter - url = 'https://suggest.seznam.cz/fulltext/cs?{query}' - - resp = get( - url.format( - query=urlencode( - {'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'} - ) - ) - ) - - if not resp.ok: - return [] - - data = resp.json() - return [ - ''.join([part.get('text', '') for part in item.get('text', [])]) - for item in data.get('result', []) - if item.get('itemType', None) == 'ItemType.TEXT' - ] - - -def startpage(query, sxng_locale): - """Autocomplete from Startpage. Supports Startpage's languages""" - lui = engines['startpage'].traits.get_language(sxng_locale, 'english') - url = 'https://startpage.com/suggestions?{query}' - resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui}))) - data = resp.json() - return [e['text'] for e in data.get('suggestions', []) if 'text' in e] - - -def swisscows(query, _lang): - # swisscows autocompleter - url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5' - - resp = json.loads(get(url.format(query=urlencode({'query': query}))).text) - return resp - - -def qwant(query, sxng_locale): - """Autocomplete from Qwant. Supports Qwant's regions.""" - results = [] - - locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US') - url = 'https://api.qwant.com/v3/suggest?{query}' - resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'}))) - - if resp.ok: - data = resp.json() - if data['status'] == 'success': - for item in data['data']['items']: - results.append(item['value']) - - return results - - -def wikipedia(query, sxng_locale): - """Autocomplete from Wikipedia. 
Supports Wikipedia's languages (aka netloc).""" - results = [] - eng_traits = engines['wikipedia'].traits - wiki_lang = eng_traits.get_language(sxng_locale, 'en') - wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org') - - url = 'https://{wiki_netloc}/w/api.php?{args}' - args = urlencode( - { - 'action': 'opensearch', - 'format': 'json', - 'formatversion': '2', - 'search': query, - 'namespace': '0', - 'limit': '10', - } - ) - resp = get(url.format(args=args, wiki_netloc=wiki_netloc)) - if resp.ok: - data = resp.json() - if len(data) > 1: - results = data[1] - - return results - - -def yandex(query, _lang): - # yandex autocompleter - url = "https://suggest.yandex.com/suggest-ff.cgi?{0}" - - resp = json.loads(get(url.format(urlencode(dict(part=query)))).text) - if len(resp) > 1: - return resp[1] - return [] - - -backends = { - 'dbpedia': dbpedia, - 'duckduckgo': duckduckgo, - 'google': google_complete, - 'seznam': seznam, - 'startpage': startpage, - 'swisscows': swisscows, - 'qwant': qwant, - 'wikipedia': wikipedia, - 'brave': brave, - 'yandex': yandex, -} - - -def search_autocomplete(backend_name, query, sxng_locale): - backend = backends.get(backend_name) - if backend is None: - return [] - try: - return backend(query, sxng_locale) - except (HTTPError, SearxEngineResponseException): - return [] diff --git a/apps/searxng/searx/babel_extract.py b/apps/searxng/searx/babel_extract.py deleted file mode 100755 index 5bcb1f0..0000000 --- a/apps/searxng/searx/babel_extract.py +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This module implements the :origin:`searxng_msg ` extractor to -extract messages from: - -- :origin:`searx/searxng.msg` - -The ``searxng.msg`` files are selected by Babel_, see Babel's configuration in -:origin:`babel.cfg`:: - - searxng_msg = searx.babel_extract.extract - ... - [searxng_msg: **/searxng.msg] - -A ``searxng.msg`` file is a python file that is *executed* by the -:py:obj:`extract` function. Additional ``searxng.msg`` files can be added by: - -1. Adding a ``searxng.msg`` file in one of the SearXNG python packages and -2. implement a method in :py:obj:`extract` that yields messages from this file. - -.. _Babel: https://babel.pocoo.org/en/latest/index.html - -""" - -from os import path - -SEARXNG_MSG_FILE = "searxng.msg" -_MSG_FILES = [path.join(path.dirname(__file__), SEARXNG_MSG_FILE)] - - -def extract( - # pylint: disable=unused-argument - fileobj, - keywords, - comment_tags, - options, -): - """Extract messages from ``searxng.msg`` files by a custom extractor_. - - .. _extractor: - https://babel.pocoo.org/en/latest/messages.html#writing-extraction-methods - """ - if fileobj.name not in _MSG_FILES: - raise RuntimeError("don't know how to extract messages from %s" % fileobj.name) - - namespace = {} - exec(fileobj.read(), {}, namespace) # pylint: disable=exec-used - - for name in namespace['__all__']: - for k, v in namespace[name].items(): - yield 0, '_', v, ["%s['%s']" % (name, k)] diff --git a/apps/searxng/searx/botdetection/__init__.py b/apps/searxng/searx/botdetection/__init__.py deleted file mode 100755 index fcd8e56..0000000 --- a/apps/searxng/searx/botdetection/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _botdetection src: - -X-Forwarded-For -=============== - -.. 
attention:: - - A correct setup of the HTTP request headers ``X-Forwarded-For`` and - ``X-Real-IP`` is essential to be able to assign a request to an IP correctly: - - - `NGINX RequestHeader`_ - - `Apache RequestHeader`_ - -.. _NGINX RequestHeader: - https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site -.. _Apache RequestHeader: - https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site - -.. autofunction:: searx.botdetection.get_real_ip - -""" - -from ._helpers import dump_request -from ._helpers import get_real_ip -from ._helpers import too_many_requests diff --git a/apps/searxng/searx/botdetection/_helpers.py b/apps/searxng/searx/botdetection/_helpers.py deleted file mode 100755 index 19905fd..0000000 --- a/apps/searxng/searx/botdetection/_helpers.py +++ /dev/null @@ -1,120 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring, invalid-name -from __future__ import annotations - -from ipaddress import ( - IPv4Network, - IPv6Network, - IPv4Address, - IPv6Address, - ip_network, -) -import flask -import werkzeug - -from searx.tools import config -from searx import logger - -logger = logger.getChild('botdetection') - - -def dump_request(request: flask.Request): - return ( - request.path - + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') - + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept') - + " || Accept-Language: %s" % request.headers.get('Accept-Language') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') - + " || Content-Type: %s" % request.headers.get('Content-Type') - + " || Content-Length: %s" % request.headers.get('Content-Length') - + " || Connection: %s" % request.headers.get('Connection') - + " || User-Agent: %s" % request.headers.get('User-Agent') - ) - - -def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: - """Returns a HTTP 429 response object and writes a ERROR message to the - 'botdetection' logger. This function is used in part by the filter methods - to return the default ``Too Many Requests`` response. - - """ - - logger.debug("BLOCK %s: %s", network.compressed, log_msg) - return flask.make_response(('Too Many Requests', 429)) - - -def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network: - """Returns the (client) network of whether the real_ip is part of.""" - - if real_ip.version == 6: - prefix = cfg['real_ip.ipv6_prefix'] - else: - prefix = cfg['real_ip.ipv4_prefix'] - network = ip_network(f"{real_ip}/{prefix}", strict=False) - # logger.debug("get_network(): %s", network.compressed) - return network - - -def get_real_ip(request: flask.Request) -> str: - """Returns real IP of the request. Since not all proxies set all the HTTP - headers and incoming headers can be faked it may happen that the IP cannot - be determined correctly. - - .. sidebar:: :py:obj:`flask.Request.remote_addr` - - SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). - - This function tries to get the remote IP in the order listed below, - additional some tests are done and if inconsistencies or errors are - detected, they are logged. - - The remote IP of the request is taken from (first match): - - - X-Forwarded-For_ header - - `X-real-IP header `__ - - :py:obj:`flask.Request.remote_addr` - - .. 
_ProxyFix: - https://werkzeug.palletsprojects.com/middleware/proxy_fix/ - - .. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - - """ - - forwarded_for = request.headers.get("X-Forwarded-For") - real_ip = request.headers.get('X-Real-IP') - remote_addr = request.remote_addr - # logger.debug( - # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr - # ) - - if not forwarded_for: - logger.error("X-Forwarded-For header is not set!") - else: - from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import - - forwarded_for = [x.strip() for x in forwarded_for.split(',')] - x_for: int = get_cfg()['real_ip.x_for'] # type: ignore - forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] - - if not real_ip: - logger.error("X-Real-IP header is not set!") - - if forwarded_for and real_ip and forwarded_for != real_ip: - logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) - - if forwarded_for and remote_addr and forwarded_for != remote_addr: - logger.warning( - "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for - ) - - if real_ip and remote_addr and real_ip != remote_addr: - logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) - - request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' - # logger.debug("get_real_ip() -> %s", request_ip) - return request_ip diff --git a/apps/searxng/searx/botdetection/http_accept.py b/apps/searxng/searx/botdetection/http_accept.py deleted file mode 100755 index b78a862..0000000 --- a/apps/searxng/searx/botdetection/http_accept.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_accept`` ----------------------- - -The ``http_accept`` method evaluates a request as the request of a bot if the -Accept_ header .. - -- did not contain ``text/html`` - -.. _Accept: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from searx.tools import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - if 'text/html' not in request.accept_mimetypes: - return too_many_requests(network, "HTTP header Accept did not contain text/html") - return None diff --git a/apps/searxng/searx/botdetection/http_accept_encoding.py b/apps/searxng/searx/botdetection/http_accept_encoding.py deleted file mode 100755 index 60718a4..0000000 --- a/apps/searxng/searx/botdetection/http_accept_encoding.py +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_accept_encoding`` -------------------------------- - -The ``http_accept_encoding`` method evaluates a request as the request of a -bot if the Accept-Encoding_ header .. - -- did not contain ``gzip`` AND ``deflate`` (if both values are missed) -- did not contain ``text/html`` - -.. 
_Accept-Encoding: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from searx.tools import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] - if not ('gzip' in accept_list or 'deflate' in accept_list): - return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") - return None diff --git a/apps/searxng/searx/botdetection/http_accept_language.py b/apps/searxng/searx/botdetection/http_accept_language.py deleted file mode 100755 index 395d28b..0000000 --- a/apps/searxng/searx/botdetection/http_accept_language.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_accept_language`` -------------------------------- - -The ``http_accept_language`` method evaluates a request as the request of a bot -if the Accept-Language_ header is unset. - -.. _Accept-Language: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent - -""" -# pylint: disable=unused-argument -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from searx.tools import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - if request.headers.get('Accept-Language', '').strip() == '': - return too_many_requests(network, "missing HTTP header Accept-Language") - return None diff --git a/apps/searxng/searx/botdetection/http_connection.py b/apps/searxng/searx/botdetection/http_connection.py deleted file mode 100755 index ee0d80a..0000000 --- a/apps/searxng/searx/botdetection/http_connection.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_connection`` --------------------------- - -The ``http_connection`` method evaluates a request as the request of a bot if -the Connection_ header is set to ``close``. - -.. 
_Connection: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from searx.tools import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - if request.headers.get('Connection', '').strip() == 'close': - return too_many_requests(network, "HTTP header 'Connection=close") - return None diff --git a/apps/searxng/searx/botdetection/http_user_agent.py b/apps/searxng/searx/botdetection/http_user_agent.py deleted file mode 100755 index 17025f6..0000000 --- a/apps/searxng/searx/botdetection/http_user_agent.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_user_agent`` --------------------------- - -The ``http_user_agent`` method evaluates a request as the request of a bot if -the User-Agent_ header is unset or matches the regular expression -:py:obj:`USER_AGENT`. - -.. _User-Agent: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -import re -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from searx.tools import config -from ._helpers import too_many_requests - - -USER_AGENT = ( - r'(' - + r'unknown' - + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' - + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' - + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' - + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' - + r'|ZmEu|BLEXBot|bitlybot' - # unmaintained Farside instances - + r'|' - + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') - # other bots and client to block - + '|.*PetalBot.*' - + r')' -) -"""Regular expression that matches to User-Agent_ from known *bots*""" - -_regexp = None - - -def regexp_user_agent(): - global _regexp # pylint: disable=global-statement - if not _regexp: - _regexp = re.compile(USER_AGENT) - return _regexp - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - user_agent = request.headers.get('User-Agent', 'unknown') - if regexp_user_agent().match(user_agent): - return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}") - return None diff --git a/apps/searxng/searx/botdetection/ip_limit.py b/apps/searxng/searx/botdetection/ip_limit.py deleted file mode 100755 index d0605dc..0000000 --- a/apps/searxng/searx/botdetection/ip_limit.py +++ /dev/null @@ -1,148 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _botdetection.ip_limit: - -Method ``ip_limit`` -------------------- - -The ``ip_limit`` method counts request from an IP in *sliding windows*. If -there are to many requests in a sliding window, the request is evaluated as a -bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ -header. 
To take privacy only the hash value of an IP is stored in the redis DB -and at least for a maximum of 10 minutes. - -The :py:obj:`.link_token` method can be used to investigate whether a request is -*suspicious*. To activate the :py:obj:`.link_token` method in the -:py:obj:`.ip_limit` method add the following to your -``/etc/searxng/limiter.toml``: - -.. code:: toml - - [botdetection.ip_limit] - link_token = true - -If the :py:obj:`.link_token` method is activated and a request is *suspicious* -the request rates are reduced: - -- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` -- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` - -To intercept bots that get their IPs from a range of IPs, there is a -:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored -for a longer time. IPs stored in this sliding window have a maximum of -:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP -makes a request that is not suspicious, the sliding window for this IP is -droped. - -.. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - -""" -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug -from searx.tools import config - -from searx import redisdb -from searx.redislib import incr_sliding_window, drop_counter - -from . import link_token -from ._helpers import ( - too_many_requests, - logger, -) - - -logger = logger.getChild('ip_limit') - -BURST_WINDOW = 20 -"""Time (sec) before sliding window for *burst* requests expires.""" - -BURST_MAX = 15 -"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`""" - -BURST_MAX_SUSPICIOUS = 2 -"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`""" - -LONG_WINDOW = 600 -"""Time (sec) before the longer sliding window expires.""" - -LONG_MAX = 150 -"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`""" - -LONG_MAX_SUSPICIOUS = 10 -"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`""" - -API_WONDOW = 3600 -"""Time (sec) before sliding window for API requests (format != html) expires.""" - -API_MAX = 4 -"""Maximum requests from one IP in the :py:obj:`API_WONDOW`""" - -SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 -"""Time (sec) before sliding window for one suspicious IP expires.""" - -SUSPICIOUS_IP_MAX = 3 -"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - # pylint: disable=too-many-return-statements - redis_client = redisdb.client() - - if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: - logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) - return None - - if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW) - if c > API_MAX: - return too_many_requests(network, "too many request in API_WINDOW") - - if cfg['botdetection.ip_limit.link_token']: - - suspicious = link_token.is_suspicious(network, request, True) - - if not suspicious: - # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed) - return None - - # this IP is suspicious: count requests from this IP - c = incr_sliding_window( - 
redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW - ) - if c > SUSPICIOUS_IP_MAX: - logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network) - return flask.redirect(flask.url_for('index'), code=302) - - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) - if c > BURST_MAX_SUSPICIOUS: - return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) - if c > LONG_MAX_SUSPICIOUS: - return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") - - return None - - # vanilla limiter without extensions counts BURST_MAX and LONG_MAX - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) - if c > BURST_MAX: - return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)") - - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) - if c > LONG_MAX: - return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)") - - return None diff --git a/apps/searxng/searx/botdetection/ip_lists.py b/apps/searxng/searx/botdetection/ip_lists.py deleted file mode 100755 index 456ef43..0000000 --- a/apps/searxng/searx/botdetection/ip_lists.py +++ /dev/null @@ -1,85 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _botdetection.ip_lists: - -Method ``ip_lists`` -------------------- - -The ``ip_lists`` method implements IP :py:obj:`block- ` and -:py:obj:`pass-lists `. - -.. code:: toml - - [botdetection.ip_lists] - - pass_ip = [ - '140.238.172.132', # IPv4 of check.searx.space - '192.168.0.0/16', # IPv4 private network - 'fe80::/10' # IPv6 linklocal - ] - block_ip = [ - '93.184.216.34', # IPv4 of example.org - '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class - ] - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from typing import Tuple -from ipaddress import ( - ip_network, - IPv4Address, - IPv6Address, -) - -from searx.tools import config -from ._helpers import logger - -logger = logger.getChild('ip_limit') - -SEARXNG_ORG = [ - # https://github.com/searxng/searxng/pull/2484#issuecomment-1576639195 - '140.238.172.132', # IPv4 check.searx.space - '2603:c022:0:4900::/56', # IPv6 check.searx.space -] -"""Passlist of IPs from the SearXNG organization, e.g. `check.searx.space`.""" - - -def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]: - """Checks if the IP on the subnet is in one of the members of the - ``botdetection.ip_lists.pass_ip`` list. - """ - - if cfg.get('botdetection.ip_lists.pass_searxng_org', default=True): - for net in SEARXNG_ORG: - net = ip_network(net, strict=False) - if real_ip.version == net.version and real_ip in net: - return True, f"IP matches {net.compressed} in SEARXNG_ORG list." - return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg) - - -def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]: - """Checks if the IP on the subnet is in one of the members of the - ``botdetection.ip_lists.block_ip`` list. - """ - - block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg) - if block: - msg += " To remove IP from list, please contact the maintainer of the service." 
- return block, msg - - -def ip_is_subnet_of_member_in_list( - real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config -) -> Tuple[bool, str]: - - for net in cfg.get(list_name, default=[]): - try: - net = ip_network(net, strict=False) - except ValueError: - logger.error("invalid IP %s in %s", net, list_name) - continue - if real_ip.version == net.version and real_ip in net: - return True, f"IP matches {net.compressed} in {list_name}." - return False, f"IP is not a member of an item in the f{list_name} list" diff --git a/apps/searxng/searx/botdetection/limiter.py b/apps/searxng/searx/botdetection/limiter.py deleted file mode 100755 index 3666658..0000000 --- a/apps/searxng/searx/botdetection/limiter.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _limiter src: - -Limiter -======= - -.. sidebar:: info - - The limiter requires a :ref:`Redis ` database. - -Bot protection / IP rate limitation. The intention of rate limitation is to -limit suspicious requests from an IP. The motivation behind this is the fact -that SearXNG passes through requests from bots and is thus classified as a bot -itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked -by the search engine (the origin) in some other way. - -To avoid blocking, the requests from bots to SearXNG must also be blocked, this -is the task of the limiter. To perform this task, the limiter uses the methods -from the :py:obj:`searx.botdetection`. - -To enable the limiter activate: - -.. code:: yaml - - server: - ... - limiter: true # rate limit the number of request on the instance, block some bots - -and set the redis-url connection. Check the value, it depends on your redis DB -(see :ref:`settings redis`), by example: - -.. code:: yaml - - redis: - url: unix:///usr/local/searxng-redis/run/redis.sock?db=0 - -""" - -from __future__ import annotations - -from pathlib import Path -from ipaddress import ip_address -import flask -import werkzeug - -from searx.tools import config -from searx import logger - -from . import ( - http_accept, - http_accept_encoding, - http_accept_language, - http_connection, - http_user_agent, - ip_limit, - ip_lists, -) - -from ._helpers import ( - get_network, - get_real_ip, - dump_request, -) - -logger = logger.getChild('botdetection.limiter') - -CFG: config.Config = None # type: ignore - -LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" -"""Base configuration (schema) of the botdetection.""" - -LIMITER_CFG = Path('/etc/searxng/limiter.toml') -"""Lokal Limiter configuration.""" - -CFG_DEPRECATED = { - # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." -} - - -def get_cfg() -> config.Config: - global CFG # pylint: disable=global-statement - if CFG is None: - CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED) - return CFG - - -def filter_request(request: flask.Request) -> werkzeug.Response | None: - # pylint: disable=too-many-return-statements - - cfg = get_cfg() - real_ip = ip_address(get_real_ip(request)) - network = get_network(real_ip, cfg) - - if request.path == '/healthz': - return None - - # link-local - - if network.is_link_local: - return None - - # block- & pass- lists - # - # 1. The IP of the request is first checked against the pass-list; if the IP - # matches an entry in the list, the request is not blocked. - # 2. 
If no matching entry is found in the pass-list, then a check is made against - # the block list; if the IP matches an entry in the list, the request is - # blocked. - # 3. If the IP is not in either list, the request is not blocked. - - match, msg = ip_lists.pass_ip(real_ip, cfg) - if match: - logger.warning("PASS %s: matched PASSLIST - %s", network.compressed, msg) - return None - - match, msg = ip_lists.block_ip(real_ip, cfg) - if match: - logger.error("BLOCK %s: matched BLOCKLIST - %s", network.compressed, msg) - return flask.make_response(('IP is on BLOCKLIST - %s' % msg, 429)) - - # methods applied on / - - for func in [ - http_user_agent, - ]: - val = func.filter_request(network, request, cfg) - if val is not None: - return val - - # methods applied on /search - - if request.path == '/search': - - for func in [ - http_accept, - http_accept_encoding, - http_accept_language, - http_connection, - http_user_agent, - ip_limit, - ]: - val = func.filter_request(network, request, cfg) - if val is not None: - return val - logger.debug(f"OK {network}: %s", dump_request(flask.request)) - return None diff --git a/apps/searxng/searx/botdetection/link_token.py b/apps/searxng/searx/botdetection/link_token.py deleted file mode 100755 index d86fa86..0000000 --- a/apps/searxng/searx/botdetection/link_token.py +++ /dev/null @@ -1,157 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``link_token`` ---------------------- - -The ``link_token`` method evaluates a request as :py:obj:`suspicious -` if the URL ``/client.css`` is not requested by the -client. By adding a random component (the token) in the URL, a bot can not send -a ping by request a static URL. - -.. note:: - - This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. - -To get in use of this method a flask URL route needs to be added: - -.. code:: python - - @app.route('/client.css', methods=['GET', 'POST']) - def client_token(token=None): - link_token.ping(request, token) - return Response('', mimetype='text/css') - -And in the HTML template from flask a stylesheet link is needed (the value of -``link_token`` comes from :py:obj:`get_token`): - -.. code:: html - - - -.. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - -""" -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, - ip_address, -) - -import string -import random -import flask - -from searx import logger -from searx import redisdb -from searx.redislib import secret_hash - -from ._helpers import ( - get_network, - get_real_ip, -) - -TOKEN_LIVE_TIME = 600 -"""Livetime (sec) of limiter's CSS token.""" - -PING_LIVE_TIME = 3600 -"""Livetime (sec) of the ping-key from a client (request)""" - -PING_KEY = 'SearXNG_limiter.ping' -"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`""" - -TOKEN_KEY = 'SearXNG_limiter.token' -"""Key for which the current token is stored in the DB""" - -logger = logger.getChild('botdetection.link_token') - - -def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): - """Checks whether a valid ping is exists for this (client) network, if not - this request is rated as *suspicious*. If a valid ping exists and argument - ``renew`` is ``True`` the expire time of this ping is reset to - :py:obj:`PING_LIVE_TIME`. 
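``limiter.filter_request()`` above takes a ``flask.Request`` and returns either a werkzeug response (block) or ``None`` (pass). The actual SearXNG webapp wiring is not part of this excerpt; a minimal sketch of hooking a filter with this signature into a Flask app could look like:

.. code:: python

    import flask
    from searx.botdetection import limiter

    app = flask.Flask(__name__)

    @app.before_request
    def apply_limiter():
        # A non-None return value short-circuits normal request handling
        # (e.g. a "429 Too Many Requests" response); None lets the request pass.
        return limiter.filter_request(flask.request)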
- - """ - redis_client = redisdb.client() - if not redis_client: - return False - - ping_key = get_ping_key(network, request) - if not redis_client.get(ping_key): - logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key) - return True - - if renew: - redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) - - logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) - return False - - -def ping(request: flask.Request, token: str): - """This function is called by a request to URL ``/client.css``. If - ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. - The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. - - """ - from . import limiter # pylint: disable=import-outside-toplevel, cyclic-import - - redis_client = redisdb.client() - if not redis_client: - return - if not token_is_valid(token): - return - - cfg = limiter.get_cfg() - real_ip = ip_address(get_real_ip(request)) - network = get_network(real_ip, cfg) - - ping_key = get_ping_key(network, request) - logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) - redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) - - -def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: - """Generates a hashed key that fits (more or less) to a *WEB-browser - session* in a network.""" - return ( - PING_KEY - + "[" - + secret_hash( - network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') - ) - + "]" - ) - - -def token_is_valid(token) -> bool: - valid = token == get_token() - logger.debug("token is valid --> %s", valid) - return valid - - -def get_token() -> str: - """Returns current token. If there is no currently active token a new token - is generated randomly and stored in the redis DB. - - - :py:obj:`TOKEN_LIVE_TIME` - - :py:obj:`TOKEN_KEY` - - """ - redis_client = redisdb.client() - if not redis_client: - # This function is also called when limiter is inactive / no redis DB - # (see render function in webapp.py) - return '12345678' - token = redis_client.get(TOKEN_KEY) - if token: - token = token.decode('UTF-8') - else: - token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) - redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) - return token diff --git a/apps/searxng/searx/compat.py b/apps/searxng/searx/compat.py deleted file mode 100755 index 15e27d4..0000000 --- a/apps/searxng/searx/compat.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pyright: basic -"""Module for backward compatibility. - -""" -# pylint: disable=C,R - - -__all__ = ('cached_property',) - - -try: - from functools import cached_property # type: ignore - -except ImportError: - - # cache_property has been added in py3.8 [1] - # - # To support cache_property in py3.7 the implementation from 3.8 has been - # copied here. This code can be cleanup with EOL of py3.7. 
- # - # [1] https://docs.python.org/3/library/functools.html#functools.cached_property - - from threading import RLock - - _NOT_FOUND = object() - - class cached_property: - def __init__(self, func): - self.func = func - self.attrname = None - self.__doc__ = func.__doc__ - self.lock = RLock() - - def __set_name__(self, owner, name): - if self.attrname is None: - self.attrname = name - elif name != self.attrname: - raise TypeError( - "Cannot assign the same cached_property to two different names " - f"({self.attrname!r} and {name!r})." - ) - - def __get__(self, instance, owner=None): - if instance is None: - return self - if self.attrname is None: - raise TypeError("Cannot use cached_property instance without calling __set_name__ on it.") - try: - cache = instance.__dict__ - except AttributeError: # not all objects have __dict__ (e.g. class defines slots) - msg = ( - f"No '__dict__' attribute on {type(instance).__name__!r} " - f"instance to cache {self.attrname!r} property." - ) - raise TypeError(msg) from None - val = cache.get(self.attrname, _NOT_FOUND) - if val is _NOT_FOUND: - with self.lock: - # check if another thread filled cache while we awaited lock - val = cache.get(self.attrname, _NOT_FOUND) - if val is _NOT_FOUND: - val = self.func(instance) - try: - cache[self.attrname] = val - except TypeError: - msg = ( - f"The '__dict__' attribute on {type(instance).__name__!r} instance " - f"does not support item assignment for caching {self.attrname!r} property." - ) - raise TypeError(msg) from None - return val diff --git a/apps/searxng/searx/data/__init__.py b/apps/searxng/searx/data/__init__.py deleted file mode 100755 index 0822f4a..0000000 --- a/apps/searxng/searx/data/__init__.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This module holds the *data* created by:: - - make data.all - -""" - -__all__ = [ - 'ENGINE_TRAITS', - 'CURRENCIES', - 'USER_AGENTS', - 'EXTERNAL_URLS', - 'WIKIDATA_UNITS', - 'EXTERNAL_BANGS', - 'OSM_KEYS_TAGS', - 'ENGINE_DESCRIPTIONS', - 'ahmia_blacklist_loader', -] - -import json -from pathlib import Path - -data_dir = Path(__file__).parent - - -def _load(filename): - with open(data_dir / filename, encoding='utf-8') as f: - return json.load(f) - - -def ahmia_blacklist_loader(): - """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion - names. The MD5 values are fetched by:: - - searxng_extra/update/update_ahmia_blacklist.py - - This function is used by :py:mod:`searx.plugins.ahmia_filter`. - - """ - with open(data_dir / 'ahmia_blacklist.txt', encoding='utf-8') as f: - return f.read().split() - - -CURRENCIES = _load('currencies.json') -USER_AGENTS = _load('useragents.json') -EXTERNAL_URLS = _load('external_urls.json') -WIKIDATA_UNITS = _load('wikidata_units.json') -EXTERNAL_BANGS = _load('external_bangs.json') -OSM_KEYS_TAGS = _load('osm_keys_tags.json') -ENGINE_DESCRIPTIONS = _load('engine_descriptions.json') -ENGINE_TRAITS = _load('engine_traits.json') diff --git a/apps/searxng/searx/enginelib/__init__.py b/apps/searxng/searx/enginelib/__init__.py deleted file mode 100755 index 6a0bb67..0000000 --- a/apps/searxng/searx/enginelib/__init__.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Implementations of the framework for the SearXNG engines. - -.. hint:: - - The long term goal is to modularize all implementations of the engine - framework here in this Python package. 
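For reference, the ``searx.data`` module above exposes each JSON payload as a module-level constant, so consumers simply import what they need (an illustrative use, not code taken from this diff):

.. code:: python

    from searx.data import ENGINE_TRAITS, ahmia_blacklist_loader

    # each JSON payload is loaded once, at import time, from searx/data/
    print(len(ENGINE_TRAITS))                  # one entry per engine name
    onion_md5_list = ahmia_blacklist_loader()  # MD5 hashes of blocked onion names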
ToDo: - - - move implementations of the :ref:`searx.engines loader` to a new module in - the :py:obj:`searx.enginelib` namespace. - -""" - - -from __future__ import annotations -from typing import List, Callable, TYPE_CHECKING - -if TYPE_CHECKING: - from searx.enginelib import traits - - -class Engine: # pylint: disable=too-few-public-methods - """Class of engine instances build from YAML settings. - - Further documentation see :ref:`general engine configuration`. - - .. hint:: - - This class is currently never initialized and only used for type hinting. - """ - - # Common options in the engine module - - engine_type: str - """Type of the engine (:ref:`searx.search.processors`)""" - - paging: bool - """Engine supports multiple pages.""" - - time_range_support: bool - """Engine supports search time range.""" - - safesearch: bool - """Engine supports SafeSearch""" - - language_support: bool - """Engine supports languages (locales) search.""" - - language: str - """For an engine, when there is ``language: ...`` in the YAML settings the engine - does support only this one language: - - .. code:: yaml - - - name: google french - engine: google - language: fr - """ - - region: str - """For an engine, when there is ``region: ...`` in the YAML settings the engine - does support only this one region:: - - .. code:: yaml - - - name: google belgium - engine: google - region: fr-BE - """ - - fetch_traits: Callable - """Function to to fetch engine's traits from origin.""" - - traits: traits.EngineTraits - """Traits of the engine.""" - - # settings.yml - - categories: List[str] - """Specifies to which :ref:`engine categories` the engine should be added.""" - - name: str - """Name that will be used across SearXNG to define this engine. In settings, on - the result page ..""" - - engine: str - """Name of the python file used to handle requests and responses to and from - this search engine (file name from :origin:`searx/engines` without - ``.py``).""" - - enable_http: bool - """Enable HTTP (by default only HTTPS is enabled).""" - - shortcut: str - """Code used to execute bang requests (``!foo``)""" - - timeout: float - """Specific timeout for search-engine.""" - - display_error_messages: bool - """Display error messages on the web UI.""" - - proxies: dict - """Set proxies for a specific engine (YAML): - - .. code:: yaml - - proxies : - http: socks5://proxy:port - https: socks5://proxy:port - """ - - disabled: bool - """To disable by default the engine, but not deleting it. It will allow the - user to manually activate it in the settings.""" - - inactive: bool - """Remove the engine from the settings (*disabled & removed*).""" - - about: dict - """Additional fileds describing the engine. - - .. 
code:: yaml - - about: - website: https://example.com - wikidata_id: Q306656 - official_api_documentation: https://example.com/api-doc - use_official_api: true - require_api_key: true - results: HTML - """ - - using_tor_proxy: bool - """Using tor proxy (``true``) or not (``false``) for this engine.""" - - send_accept_language_header: bool - """When this option is activated, the language (locale) that is selected by - the user is used to build and send a ``Accept-Language`` header in the - request to the origin search engine.""" - - tokens: List[str] - """A list of secret tokens to make this engine *private*, more details see - :ref:`private engines`.""" diff --git a/apps/searxng/searx/enginelib/traits.py b/apps/searxng/searx/enginelib/traits.py deleted file mode 100755 index 6402fde..0000000 --- a/apps/searxng/searx/enginelib/traits.py +++ /dev/null @@ -1,252 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Engine's traits are fetched from the origin engines and stored in a JSON file -in the *data folder*. Most often traits are languages and region codes and -their mapping from SearXNG's representation to the representation in the origin -search engine. For new traits new properties can be added to the class -:py:class:`EngineTraits`. - -To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be -used. -""" - -from __future__ import annotations -import json -import dataclasses -import types -from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING -from typing_extensions import Literal, Self - -from searx import locales -from searx.data import data_dir, ENGINE_TRAITS - -if TYPE_CHECKING: - from . import Engine - - -class EngineTraitsEncoder(json.JSONEncoder): - """Encodes :class:`EngineTraits` to a serializable object, see - :class:`json.JSONEncoder`.""" - - def default(self, o): - """Return dictionary of a :class:`EngineTraits` object.""" - if isinstance(o, EngineTraits): - return o.__dict__ - return super().default(o) - - -@dataclasses.dataclass -class EngineTraits: - """The class is intended to be instantiated for each engine.""" - - regions: Dict[str, str] = dataclasses.field(default_factory=dict) - """Maps SearXNG's internal representation of a region to the one of the engine. - - SearXNG's internal representation can be parsed by babel and the value is - send to the engine: - - .. code:: python - - regions ={ - 'fr-BE' : , - } - - for key, egnine_region regions.items(): - searxng_region = babel.Locale.parse(key, sep='-') - ... - """ - - languages: Dict[str, str] = dataclasses.field(default_factory=dict) - """Maps SearXNG's internal representation of a language to the one of the engine. - - SearXNG's internal representation can be parsed by babel and the value is - send to the engine: - - .. code:: python - - languages = { - 'ca' : , - } - - for key, egnine_lang in languages.items(): - searxng_lang = babel.Locale.parse(key) - ... - """ - - all_locale: Optional[str] = None - """To which locale value SearXNG's ``all`` language is mapped (shown a "Default - language"). - """ - - data_type: Literal['traits_v1'] = 'traits_v1' - """Data type, default is 'traits_v1'. - """ - - custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict) - """A place to store engine's custom traits, not related to the SearXNG core. - """ - - def get_language(self, searxng_locale: str, default=None): - """Return engine's language string that *best fits* to SearXNG's locale. 
- - :param searxng_locale: SearXNG's internal representation of locale - selected by the user. - - :param default: engine's default language - - The *best fits* rules are implemented in - :py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all`` - which is determined from :py:obj:`EngineTraits.all_locale`. - """ - if searxng_locale == 'all' and self.all_locale is not None: - return self.all_locale - return locales.get_engine_locale(searxng_locale, self.languages, default=default) - - def get_region(self, searxng_locale: str, default=None): - """Return engine's region string that best fits to SearXNG's locale. - - :param searxng_locale: SearXNG's internal representation of locale - selected by the user. - - :param default: engine's default region - - The *best fits* rules are implemented in - :py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all`` - which is determined from :py:obj:`EngineTraits.all_locale`. - """ - if searxng_locale == 'all' and self.all_locale is not None: - return self.all_locale - return locales.get_engine_locale(searxng_locale, self.regions, default=default) - - def is_locale_supported(self, searxng_locale: str) -> bool: - """A *locale* (SearXNG's internal representation) is considered to be - supported by the engine if the *region* or the *language* is supported - by the engine. - - For verification the functions :py:func:`EngineTraits.get_region` and - :py:func:`EngineTraits.get_language` are used. - """ - if self.data_type == 'traits_v1': - return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale)) - - raise TypeError('engine traits of type %s is unknown' % self.data_type) - - def copy(self): - """Create a copy of the dataclass object.""" - return EngineTraits(**dataclasses.asdict(self)) - - @classmethod - def fetch_traits(cls, engine: Engine) -> Union[Self, None]: - """Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch - and set properties from the origin engine in the object ``engine_traits``. If - function does not exists, ``None`` is returned. - """ - - fetch_traits = getattr(engine, 'fetch_traits', None) - engine_traits = None - - if fetch_traits: - engine_traits = cls() - fetch_traits(engine_traits) - return engine_traits - - def set_traits(self, engine: Engine): - """Set traits from self object in a :py:obj:`.Engine` namespace. 
- - :param engine: engine instance build by :py:func:`searx.engines.load_engine` - """ - - if self.data_type == 'traits_v1': - self._set_traits_v1(engine) - else: - raise TypeError('engine traits of type %s is unknown' % self.data_type) - - def _set_traits_v1(self, engine: Engine): - # For an engine, when there is `language: ...` in the YAML settings the engine - # does support only this one language (region):: - # - # - name: google italian - # engine: google - # language: it - # region: it-IT - - traits = self.copy() - - _msg = "settings.yml - engine: '%s' / %s: '%s' not supported" - - languages = traits.languages - if hasattr(engine, 'language'): - if engine.language not in languages: - raise ValueError(_msg % (engine.name, 'language', engine.language)) - traits.languages = {engine.language: languages[engine.language]} - - regions = traits.regions - if hasattr(engine, 'region'): - if engine.region not in regions: - raise ValueError(_msg % (engine.name, 'region', engine.region)) - traits.regions = {engine.region: regions[engine.region]} - - engine.language_support = bool(traits.languages or traits.regions) - - # set the copied & modified traits in engine's namespace - engine.traits = traits - - -class EngineTraitsMap(Dict[str, EngineTraits]): - """A python dictionary to map :class:`EngineTraits` by engine name.""" - - ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve() - """File with persistence of the :py:obj:`EngineTraitsMap`.""" - - def save_data(self): - """Store EngineTraitsMap in in file :py:obj:`self.ENGINE_TRAITS_FILE`""" - with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f: - json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder) - - @classmethod - def from_data(cls) -> Self: - """Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`""" - obj = cls() - for k, v in ENGINE_TRAITS.items(): - obj[k] = EngineTraits(**v) - return obj - - @classmethod - def fetch_traits(cls, log: Callable) -> Self: - from searx import engines # pylint: disable=cyclic-import, import-outside-toplevel - - names = list(engines.engines) - names.sort() - obj = cls() - - for engine_name in names: - engine = engines.engines[engine_name] - - traits = EngineTraits.fetch_traits(engine) - if traits is not None: - log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages))) - log("%-20s: SearXNG regions --> %s" % (engine_name, len(traits.regions))) - obj[engine_name] = traits - - return obj - - def set_traits(self, engine: Engine | types.ModuleType): - """Set traits in a :py:obj:`Engine` namespace. - - :param engine: engine instance build by :py:func:`searx.engines.load_engine` - """ - - engine_traits = EngineTraits(data_type='traits_v1') - if engine.name in self.keys(): - engine_traits = self[engine.name] - - elif engine.engine in self.keys(): - # The key of the dictionary traits_map is the *engine name* - # configured in settings.xml. When multiple engines are configured - # in settings.yml to use the same origin engine (python module) - # these additional engines can use the languages from the origin - # engine. 
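A sketch of how the traits persistence described here is typically consumed; the ``'google'`` key below is only a placeholder engine name, not something taken from this diff:

.. code:: python

    from searx.enginelib.traits import EngineTraitsMap

    traits_map = EngineTraitsMap.from_data()   # deserialize engine_traits.json
    traits = traits_map.get('google')          # EngineTraits of one engine (placeholder name)
    if traits is not None:
        print(traits.get_region('fr-BE', default='US'))
        print(traits.is_locale_supported('de-DE'))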
For this use the configured ``engine: ...`` from - # settings.yml - engine_traits = self[engine.engine] - - engine_traits.set_traits(engine) diff --git a/apps/searxng/searx/engines/1337x.py b/apps/searxng/searx/engines/1337x.py deleted file mode 100755 index 730a4c4..0000000 --- a/apps/searxng/searx/engines/1337x.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - 1337x -""" - -from urllib.parse import quote, urljoin -from lxml import html -from searx.utils import extract_text, get_torrent_size, eval_xpath, eval_xpath_list, eval_xpath_getindex - -# about -about = { - "website": 'https://1337x.to/', - "wikidata_id": 'Q28134166', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -url = 'https://1337x.to/' -search_url = url + 'search/{search_term}/{pageno}/' -categories = ['files'] -paging = True - - -def request(query, params): - params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno']) - - return params - - -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'): - href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0)) - title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]')) - seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]')) - leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]')) - filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()')) - filesize, filesize_multiplier = filesize_info.split() - filesize = get_torrent_size(filesize, filesize_multiplier) - - results.append( - { - 'url': href, - 'title': title, - 'seed': seed, - 'leech': leech, - 'filesize': filesize, - 'template': 'torrent.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/9gag.py b/apps/searxng/searx/engines/9gag.py deleted file mode 100755 index d184672..0000000 --- a/apps/searxng/searx/engines/9gag.py +++ /dev/null @@ -1,77 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=invalid-name -"""9GAG (social media)""" - -from json import loads -from datetime import datetime -from urllib.parse import urlencode - -about = { - "website": 'https://9gag.com/', - "wikidata_id": 'Q277421', - "official_api_documentation": None, - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['social media'] -paging = True - -search_url = "https://9gag.com/v1/search-posts?{query}" -page_size = 10 - - -def request(query, params): - query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size}) - - params['url'] = search_url.format(query=query) - - return params - - -def response(resp): - results = [] - - json_results = loads(resp.text)['data'] - - for result in json_results['posts']: - result_type = result['type'] - - # Get the not cropped version of the thumbnail when the image height is not too important - if result['images']['image700']['height'] > 400: - thumbnail = result['images']['imageFbThumbnail']['url'] - else: - thumbnail = result['images']['image700']['url'] - - if result_type == 'Photo': - results.append( - { - 'template': 'images.html', - 'url': result['url'], - 'title': result['title'], - 'content': result['description'], - 'publishedDate': datetime.utcfromtimestamp(result['creationTs']), - 'img_src': 
result['images']['image700']['url'], - 'thumbnail_src': thumbnail, - } - ) - elif result_type == 'Animated': - results.append( - { - 'template': 'videos.html', - 'url': result['url'], - 'title': result['title'], - 'content': result['description'], - 'publishedDate': datetime.utcfromtimestamp(result['creationTs']), - 'thumbnail': thumbnail, - 'iframe_src': result['images'].get('image460sv', {}).get('url'), - } - ) - - if 'tags' in json_results: - for suggestion in json_results['tags']: - results.append({'suggestion': suggestion['key']}) - - return results diff --git a/apps/searxng/searx/engines/__init__.py b/apps/searxng/searx/engines/__init__.py deleted file mode 100755 index da2b203..0000000 --- a/apps/searxng/searx/engines/__init__.py +++ /dev/null @@ -1,253 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Load and initialize the ``engines``, see :py:func:`load_engines` and register -:py:obj:`engine_shortcuts`. - -usage:: - - load_engines( settings['engines'] ) - -""" - -from __future__ import annotations - -import sys -import copy -from os.path import realpath, dirname - -from typing import TYPE_CHECKING, Dict -import types -import inspect - -from searx import logger, settings -from searx.utils import load_module - -if TYPE_CHECKING: - from searx.enginelib import Engine - -logger = logger.getChild('engines') -ENGINE_DIR = dirname(realpath(__file__)) -ENGINE_DEFAULT_ARGS = { - # Common options in the engine module - "engine_type": "online", - "paging": False, - "time_range_support": False, - "safesearch": False, - # settings.yml - "categories": ["general"], - "enable_http": False, - "shortcut": "-", - "timeout": settings["outgoing"]["request_timeout"], - "display_error_messages": True, - "disabled": False, - "inactive": False, - "about": {}, - "using_tor_proxy": False, - "send_accept_language_header": False, - "tokens": [], -} -# set automatically when an engine does not have any tab category -DEFAULT_CATEGORY = 'other' - - -# Defaults for the namespace of an engine module, see :py:func:`load_engine` - -categories = {'general': []} -engines: Dict[str, Engine | types.ModuleType] = {} -engine_shortcuts = {} -"""Simple map of registered *shortcuts* to name of the engine (or ``None``). - -:: - - engine_shortcuts[engine.shortcut] = engine.name - -:meta hide-value: -""" - - -def check_engine_module(module: types.ModuleType): - # probe unintentional name collisions / for example name collisions caused - # by import statements in the engine module .. - - # network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861 - obj = getattr(module, 'network', None) - if obj and inspect.ismodule(obj): - msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string' - # logger.error(msg) - raise TypeError(msg) - - -def load_engine(engine_data: dict) -> Engine | types.ModuleType | None: - """Load engine from ``engine_data``. - - :param dict engine_data: Attributes from YAML ``settings:engines/`` - :return: initialized namespace of the ````. - - 1. create a namespace and load module of the ```` - 2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS` - 3. update namespace with values from ``engine_data`` - - If engine *is active*, return namespace of the engine, otherwise return - ``None``. 
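Start-up usage of the loader, mirroring the ``usage::`` note above (an illustrative sketch, not code from this repository):

.. code:: python

    from searx import settings
    from searx.engines import load_engines, engines, engine_shortcuts

    load_engines(settings['engines'])          # build the registry from settings.yml

    print(len(engines), "engines registered")
    for shortcut, name in engine_shortcuts.items():
        print(f"!{shortcut:<6} {name}")        # shortcuts back bang queries like `!foo`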
- - This function also returns ``None`` if initialization of the namespace fails - for one of the following reasons: - - - engine name contains underscore - - engine name is not lowercase - - required attribute is not set :py:func:`is_missing_required_attributes` - - """ - # pylint: disable=too-many-return-statements - - engine_name = engine_data.get('name') - if engine_name is None: - logger.error('An engine does not have a "name" field') - return None - if '_' in engine_name: - logger.error('Engine name contains underscore: "{}"'.format(engine_name)) - return None - - if engine_name.lower() != engine_name: - logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name)) - engine_name = engine_name.lower() - engine_data['name'] = engine_name - - # load_module - module_name = engine_data.get('engine') - if module_name is None: - logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name)) - return None - try: - engine = load_module(module_name + '.py', ENGINE_DIR) - except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError): - logger.exception('Fatal exception in engine "{}"'.format(module_name)) - sys.exit(1) - except BaseException: - logger.exception('Cannot load engine "{}"'.format(module_name)) - return None - - check_engine_module(engine) - update_engine_attributes(engine, engine_data) - update_attributes_for_tor(engine) - - # avoid cyclic imports - # pylint: disable=import-outside-toplevel - from searx.enginelib.traits import EngineTraitsMap - - trait_map = EngineTraitsMap.from_data() - trait_map.set_traits(engine) - - if not is_engine_active(engine): - return None - - if is_missing_required_attributes(engine): - return None - - set_loggers(engine, engine_name) - - if not any(cat in settings['categories_as_tabs'] for cat in engine.categories): - engine.categories.append(DEFAULT_CATEGORY) - - return engine - - -def set_loggers(engine, engine_name): - # set the logger for engine - engine.logger = logger.getChild(engine_name) - # the engine may have load some other engines - # may sure the logger is initialized - # use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration" - # see https://github.com/python/cpython/issues/89516 - # and https://docs.python.org/3.10/library/sys.html#sys.modules - modules = sys.modules.copy() - for module_name, module in modules.items(): - if ( - module_name.startswith("searx.engines") - and module_name != "searx.engines.__init__" - and not hasattr(module, "logger") - ): - module_engine_name = module_name.split(".")[-1] - module.logger = logger.getChild(module_engine_name) # type: ignore - - -def update_engine_attributes(engine: Engine | types.ModuleType, engine_data): - # set engine attributes from engine_data - for param_name, param_value in engine_data.items(): - if param_name == 'categories': - if isinstance(param_value, str): - param_value = list(map(str.strip, param_value.split(','))) - engine.categories = param_value # type: ignore - elif hasattr(engine, 'about') and param_name == 'about': - engine.about = {**engine.about, **engine_data['about']} # type: ignore - else: - setattr(engine, param_name, param_value) - - # set default attributes - for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items(): - if not hasattr(engine, arg_name): - setattr(engine, arg_name, copy.deepcopy(arg_value)) - - -def update_attributes_for_tor(engine: Engine | types.ModuleType): - if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): - 
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore - engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore - - -def is_missing_required_attributes(engine): - """An attribute is required when its name doesn't start with ``_`` (underline). - Required attributes must not be ``None``. - - """ - missing = False - for engine_attr in dir(engine): - if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None: - logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr)) - missing = True - return missing - - -def using_tor_proxy(engine: Engine | types.ModuleType): - """Return True if the engine configuration declares to use Tor.""" - return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False) - - -def is_engine_active(engine: Engine | types.ModuleType): - # check if engine is inactive - if engine.inactive is True: - return False - - # exclude onion engines if not using tor - if 'onions' in engine.categories and not using_tor_proxy(engine): - return False - - return True - - -def register_engine(engine: Engine | types.ModuleType): - if engine.name in engines: - logger.error('Engine config error: ambiguous name: {0}'.format(engine.name)) - sys.exit(1) - engines[engine.name] = engine - - if engine.shortcut in engine_shortcuts: - logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut)) - sys.exit(1) - engine_shortcuts[engine.shortcut] = engine.name - - for category_name in engine.categories: - categories.setdefault(category_name, []).append(engine) - - -def load_engines(engine_list): - """usage: ``engine_list = settings['engines']``""" - engines.clear() - engine_shortcuts.clear() - categories.clear() - categories['general'] = [] - for engine_data in engine_list: - engine = load_engine(engine_data) - if engine: - register_engine(engine) - return engines diff --git a/apps/searxng/searx/engines/ahmia.py b/apps/searxng/searx/engines/ahmia.py deleted file mode 100755 index 33e0cc3..0000000 --- a/apps/searxng/searx/engines/ahmia.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Ahmia (Onions) -""" - -from urllib.parse import urlencode, urlparse, parse_qs -from lxml.html import fromstring -from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath - -# about -about = { - "website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion', - "wikidata_id": 'Q18693938', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine config -categories = ['onions'] -paging = True -page_size = 10 - -# search url -search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}' -time_range_support = True -time_range_dict = {'day': 1, 'week': 7, 'month': 30} - -# xpaths -results_xpath = '//li[@class="result"]' -url_xpath = './h4/a/@href' -title_xpath = './h4/a[1]' -content_xpath = './/p[1]' -correction_xpath = '//*[@id="didYouMean"]//a' -number_of_results_xpath = '//*[@id="totalResults"]' - - -def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query})) - - if params['time_range'] in time_range_dict: - params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]}) - - return params - - -def response(resp): - results = [] - dom = fromstring(resp.text) - - # trim results so there's not way too many at once - 
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1) - all_results = eval_xpath_list(dom, results_xpath) - trimmed_results = all_results[first_result_index : first_result_index + page_size] - - # get results - for result in trimmed_results: - # remove ahmia url and extract the actual url for the result - raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) - cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0] - - title = extract_text(eval_xpath(result, title_xpath)) - content = extract_text(eval_xpath(result, content_xpath)) - - results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True}) - - # get spelling corrections - for correction in eval_xpath_list(dom, correction_xpath): - results.append({'correction': extract_text(correction)}) - - # get number of results - number_of_results = eval_xpath(dom, number_of_results_xpath) - if number_of_results: - try: - results.append({'number_of_results': int(extract_text(number_of_results))}) - except: - pass - - return results diff --git a/apps/searxng/searx/engines/annas_archive.py b/apps/searxng/searx/engines/annas_archive.py deleted file mode 100755 index 1bcdeee..0000000 --- a/apps/searxng/searx/engines/annas_archive.py +++ /dev/null @@ -1,187 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""`Anna's Archive`_ is a free non-profit online shadow library metasearch -engine providing access to a variety of book resources (also via IPFS), created -by a team of anonymous archivists (AnnaArchivist_). - -.. _Anna's Archive: https://annas-archive.org/ -.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive - -Configuration -============= - -The engine has the following additional settings: - -- :py:obj:`aa_content` -- :py:obj:`aa_ext` -- :py:obj:`aa_sort` - -With this options a SearXNG maintainer is able to configure **additional** -engines for specific searches in Anna's Archive. For example a engine to search -for *newest* articles and journals (PDF) / by shortcut ``!aaa ``. - -.. code:: yaml - - - name: annas articles - engine: annas_archive - shortcut: aaa - aa_content: 'journal_article' - aa_ext: 'pdf' - aa_sort: 'newest' - -Implementations -=============== - -""" - -from typing import List, Dict, Any, Optional -from urllib.parse import quote -from lxml import html - -from searx.utils import extract_text, eval_xpath, eval_xpath_list -from searx.enginelib.traits import EngineTraits -from searx.data import ENGINE_TRAITS - -# about -about: Dict[str, Any] = { - "website": "https://annas-archive.org/", - "wikidata_id": "Q115288326", - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": "HTML", -} - -# engine dependent config -categories: List[str] = ["files"] -paging: bool = False - -# search-url -base_url: str = "https://annas-archive.org" -aa_content: str = "" -"""Anan's search form field **Content** / possible values:: - - journal_article, book_any, book_fiction, book_unknown, book_nonfiction, - book_comic, magazine, standards_document - -To not filter use an empty string (default). -""" -aa_sort: str = '' -"""Sort Anna's results, possible values:: - - newest, oldest, largest, smallest - -To sort by *most relevant* use an empty string (default).""" - -aa_ext: str = '' -"""Filter Anna's results by a file ending. Common filters for example are -``pdf`` and ``epub``. - -.. 
note:: - - Anna's Archive is a beta release: Filter results by file extension does not - really work on Anna's Archive. - -""" - - -def init(engine_settings=None): # pylint: disable=unused-argument - """Check of engine's settings.""" - traits = EngineTraits(**ENGINE_TRAITS['annas archive']) - - if aa_content and aa_content not in traits.custom['content']: - raise ValueError(f'invalid setting content: {aa_content}') - - if aa_sort and aa_sort not in traits.custom['sort']: - raise ValueError(f'invalid setting sort: {aa_sort}') - - if aa_ext and aa_ext not in traits.custom['ext']: - raise ValueError(f'invalid setting ext: {aa_ext}') - - -def request(query, params: Dict[str, Any]) -> Dict[str, Any]: - q = quote(query) - lang = traits.get_language(params["language"], traits.all_locale) # type: ignore - params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}" - return params - - -def response(resp) -> List[Dict[str, Optional[str]]]: - results: List[Dict[str, Optional[str]]] = [] - dom = html.fromstring(resp.text) - - for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'): - results.append(_get_result(item)) - - # The rendering of the WEB page is very strange; except the first position - # all other positions of Anna's result page are enclosed in SGML comments. - # These comments are *uncommented* by some JS code, see query of class - # '.js-scroll-hidden' in Anna's HTML template: - # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html - - for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'): - item = html.fromstring(item.xpath('./comment()')[0].text) - results.append(_get_result(item)) - - return results - - -def _get_result(item): - return { - 'template': 'paper.html', - 'url': base_url + item.xpath('./@href')[0], - 'title': extract_text(eval_xpath(item, './/h3/text()[1]')), - 'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')), - 'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))], - 'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')), - 'img_src': item.xpath('.//img/@src')[0], - } - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and other search arguments from Anna's search form.""" - # pylint: disable=import-outside-toplevel - - import babel - from searx.network import get # see https://github.com/searxng/searxng/issues/762 - from searx.locales import language_tag - - engine_traits.all_locale = '' - engine_traits.custom['content'] = [] - engine_traits.custom['ext'] = [] - engine_traits.custom['sort'] = [] - - resp = get(base_url + '/search') - if not resp.ok: # type: ignore - raise RuntimeError("Response from Anna's search page is not OK.") - dom = html.fromstring(resp.text) # type: ignore - - # supported language codes - - lang_map = {} - for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"): - eng_lang = x.get("value") - if eng_lang in ('', '_empty', 'nl-BE', 'und'): - continue - try: - locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') - except babel.UnknownLocaleError: - # silently ignore unknown languages - # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) - continue - sxng_lang = language_tag(locale) - conflict = engine_traits.languages.get(sxng_lang) - if conflict: - if conflict != eng_lang: - print("CONFLICT: babel %s --> %s, %s" % 
(sxng_lang, conflict, eng_lang)) - continue - engine_traits.languages[sxng_lang] = eng_lang - - for x in eval_xpath_list(dom, "//form//select[@name='content']//option"): - engine_traits.custom['content'].append(x.get("value")) - - for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"): - engine_traits.custom['ext'].append(x.get("value")) - - for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"): - engine_traits.custom['sort'].append(x.get("value")) diff --git a/apps/searxng/searx/engines/apkmirror.py b/apps/searxng/searx/engines/apkmirror.py deleted file mode 100755 index ac7cd74..0000000 --- a/apps/searxng/searx/engines/apkmirror.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""APKMirror -""" - -# pylint: disable=invalid-name - -from urllib.parse import urlencode -from lxml import html - -from searx.utils import ( - eval_xpath_list, - eval_xpath_getindex, - extract_text, -) - -about = { - "website": 'https://www.apkmirror.com', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files', 'apps'] -paging = True -time_range_support = False - -# search-url -base_url = 'https://www.apkmirror.com' -search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}' - - -def request(query, params): - params['url'] = search_url.format( - pageno=params['pageno'], - query=urlencode({'s': query}), - ) - logger.debug("query_url --> %s", params['url']) - return params - - -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - # parse results - for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"): - - link = eval_xpath_getindex(result, './/h5/a', 0) - - url = base_url + link.attrib.get('href') + '#downloads' - title = extract_text(link) - img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0) - res = {'url': url, 'title': title, 'img_src': img_src} - - results.append(res) - - return results diff --git a/apps/searxng/searx/engines/apple_app_store.py b/apps/searxng/searx/engines/apple_app_store.py deleted file mode 100755 index f75a1a6..0000000 --- a/apps/searxng/searx/engines/apple_app_store.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" - Apple App Store -""" - -from json import loads -from urllib.parse import urlencode -from dateutil.parser import parse - -about = { - "website": 'https://www.apple.com/app-store/', - "wikidata_id": 'Q368215', - "official_api_documentation": ( - 'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/' - 'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1' - ), - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['files', 'apps'] -safesearch = True - -search_url = 'https://itunes.apple.com/search?{query}' - - -def request(query, params): - explicit = "Yes" - - if params['safesearch'] > 0: - explicit = "No" - - params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit})) - - return params - - -def response(resp): - results = [] - - json_result = loads(resp.text) - - for result in json_result['results']: - results.append( - { - 'url': result['trackViewUrl'], - 'title': result['trackName'], - 'content': result['description'], - 'img_src': 
result['artworkUrl100'], - 'publishedDate': parse(result['currentVersionReleaseDate']), - 'author': result['sellerName'], - } - ) - - return results diff --git a/apps/searxng/searx/engines/apple_maps.py b/apps/searxng/searx/engines/apple_maps.py deleted file mode 100755 index eb4af42..0000000 --- a/apps/searxng/searx/engines/apple_maps.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Apple Maps""" - -from json import loads -from time import time -from urllib.parse import urlencode - -from searx.network import get as http_get -from searx.engines.openstreetmap import get_key_label - -about = { - "website": 'https://www.apple.com/maps/', - "wikidata_id": 'Q276101', - "official_api_documentation": None, - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -token = {'value': '', 'last_updated': None} - -categories = ['map'] -paging = False - -search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53" - - -def obtain_token(): - update_time = time() - (time() % 1800) - try: - # use duckduckgo's mapkit token - token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0) - actual_token = http_get( - 'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1', - timeout=2.0, - headers={'Authorization': 'Bearer ' + token_response.text}, - ) - token['value'] = loads(actual_token.text)['authInfo']['access_token'] - token['last_updated'] = update_time - # pylint: disable=bare-except - except: - pass - return token - - -def request(query, params): - if time() - (token['last_updated'] or 0) > 1800: - obtain_token() - - params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']})) - - params['headers'] = {'Authorization': 'Bearer ' + token['value']} - - return params - - -def response(resp): - results = [] - - resp_json = loads(resp.text) - - user_language = resp.search_params['language'] - - for result in resp_json['results']: - boundingbox = None - if 'displayMapRegion' in result: - box = result['displayMapRegion'] - boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']] - - links = [] - if 'telephone' in result: - telephone = result['telephone'] - links.append( - { - 'label': get_key_label('phone', user_language), - 'url': 'tel:' + telephone, - 'url_label': telephone, - } - ) - if result.get('urls'): - url = result['urls'][0] - links.append( - { - 'label': get_key_label('website', user_language), - 'url': url, - 'url_label': url, - } - ) - - results.append( - { - 'template': 'map.html', - 'type': result.get('poiCategory'), - 'title': result['name'], - 'links': links, - 'latitude': result['center']['lat'], - 'longitude': result['center']['lng'], - 'url': result['placecardUrl'], - 'boundingbox': boundingbox, - 'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]}, - 'address': { - 'name': result['name'], - 'house_number': result.get('subThoroughfare'), - 'road': result.get('thoroughfare'), - 'locality': result.get('locality'), - 'postcode': result.get('postCode'), - 'country': result.get('country'), - }, - } - ) - - return results diff --git a/apps/searxng/searx/engines/archlinux.py b/apps/searxng/searx/engines/archlinux.py deleted file mode 100755 index 17bb1b6..0000000 --- a/apps/searxng/searx/engines/archlinux.py +++ /dev/null @@ -1,152 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Arch Linux Wiki -~~~~~~~~~~~~~~~ - -This 
implementation does not use a official API: Mediawiki provides API, but -Arch Wiki blocks access to it. - -""" - -from typing import TYPE_CHECKING -from urllib.parse import urlencode, urljoin, urlparse -import lxml -import babel - -from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex -from searx.enginelib.traits import EngineTraits -from searx.locales import language_tag - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - - -about = { - "website": 'https://wiki.archlinux.org/', - "wikidata_id": 'Q101445877', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['it', 'software wikis'] -paging = True -main_wiki = 'wiki.archlinux.org' - - -def request(query, params): - - sxng_lang = params['searxng_locale'].split('-')[0] - netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore - title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore - base_url = 'https://' + netloc + '/index.php?' - offset = (params['pageno'] - 1) * 20 - - if netloc == main_wiki: - eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore - query += ' (' + eng_lang + ')' - elif netloc == 'wiki.archlinuxcn.org': - base_url = 'https://' + netloc + '/wzh/index.php?' - - args = { - 'search': query, - 'title': title, - 'limit': 20, - 'offset': offset, - 'profile': 'default', - } - - params['url'] = base_url + urlencode(args) - return params - - -def response(resp): - - results = [] - dom = lxml.html.fromstring(resp.text) # type: ignore - - # get the base URL for the language in which request was made - sxng_lang = resp.search_params['searxng_locale'].split('-')[0] - netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore - base_url = 'https://' + netloc + '/index.php?' - - for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'): - link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0) - content = extract_text(result.xpath('.//div[@class="searchresult"]')) - results.append( - { - 'url': urljoin(base_url, link.get('href')), # type: ignore - 'title': extract_text(link), - 'content': content, - } - ) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages from Archlinix-Wiki. The location of the Wiki address of a - language is mapped in a :py:obj:`custom field - ` (``wiki_netloc``). Depending - on the location, the ``title`` argument in the request is translated. - - .. code:: python - - "custom": { - "wiki_netloc": { - "de": "wiki.archlinux.de", - # ... - "zh": "wiki.archlinuxcn.org" - } - "title": { - "de": "Spezial:Suche", - # ... 
- "zh": "Special:\u641c\u7d22" - }, - }, - - """ - # pylint: disable=import-outside-toplevel - from searx.network import get # see https://github.com/searxng/searxng/issues/762 - - engine_traits.custom['wiki_netloc'] = {} - engine_traits.custom['title'] = {} - - title_map = { - 'de': 'Spezial:Suche', - 'fa': 'ویژه:جستجو', - 'ja': '特別:検索', - 'zh': 'Special:搜索', - } - - resp = get('https://wiki.archlinux.org/') - if not resp.ok: # type: ignore - print("ERROR: response from wiki.archlinix.org is not OK.") - - dom = lxml.html.fromstring(resp.text) # type: ignore - for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"): - - sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-')) - # zh_Hans --> zh - sxng_tag = sxng_tag.split('_')[0] - - netloc = urlparse(a.get('href')).netloc - if netloc != 'wiki.archlinux.org': - title = title_map.get(sxng_tag) - if not title: - print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag)) - continue - engine_traits.custom['wiki_netloc'][sxng_tag] = netloc - engine_traits.custom['title'][sxng_tag] = title # type: ignore - - eng_tag = extract_text(eval_xpath_list(a, ".//span")) - engine_traits.languages[sxng_tag] = eng_tag # type: ignore - - engine_traits.languages['en'] = 'English' diff --git a/apps/searxng/searx/engines/artic.py b/apps/searxng/searx/engines/artic.py deleted file mode 100755 index c0ae0a5..0000000 --- a/apps/searxng/searx/engines/artic.py +++ /dev/null @@ -1,69 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""The Art Institute of Chicago - -Explore thousands of artworks from The Art Institute of Chicago. - -* https://artic.edu - -""" - -from json import loads -from urllib.parse import urlencode - -about = { - "website": 'https://www.artic.edu', - "wikidata_id": 'Q239303', - "official_api_documentation": 'http://api.artic.edu/docs/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['images'] -paging = True -nb_per_page = 20 - -search_api = 'https://api.artic.edu/api/v1/artworks/search?' 
-image_api = 'https://www.artic.edu/iiif/2/' - - -def request(query, params): - - args = urlencode( - { - 'q': query, - 'page': params['pageno'], - 'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles', - 'limit': nb_per_page, - } - ) - params['url'] = search_api + args - - logger.debug("query_url --> %s", params['url']) - return params - - -def response(resp): - - results = [] - json_data = loads(resp.text) - - for result in json_data['data']: - - if not result['image_id']: - continue - - results.append( - { - 'url': 'https://artic.edu/artworks/%(id)s' % result, - 'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result, - 'content': result['medium_display'], - 'author': ', '.join(result['artist_titles']), - 'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result, - 'img_format': result['dimensions'], - 'template': 'images.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/arxiv.py b/apps/searxng/searx/engines/arxiv.py deleted file mode 100755 index a4811eb..0000000 --- a/apps/searxng/searx/engines/arxiv.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - ArXiV (Scientific preprints) -""" - -from lxml import etree -from lxml.etree import XPath -from datetime import datetime -from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex - -# about -about = { - "website": 'https://arxiv.org', - "wikidata_id": 'Q118398', - "official_api_documentation": 'https://arxiv.org/help/api', - "use_official_api": True, - "require_api_key": False, - "results": 'XML-RSS', -} - -categories = ['science', 'scientific publications'] -paging = True - -base_url = ( - 'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}' -) - -# engine dependent config -number_of_results = 10 - -# xpaths -arxiv_namespaces = { - "atom": "http://www.w3.org/2005/Atom", - "arxiv": "http://arxiv.org/schemas/atom", -} -xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) -xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) -xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) -xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) -xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) -xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) -xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) -xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) -xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) -xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) -xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) - - -def request(query, params): - # basic search - offset = (params['pageno'] - 1) * number_of_results - - string_args = dict(query=query, offset=offset, number_of_results=number_of_results) - - params['url'] = base_url.format(**string_args) - - return params - - -def response(resp): - results = [] - dom = etree.fromstring(resp.content) - for entry in eval_xpath_list(dom, xpath_entry): - title = eval_xpath_getindex(entry, xpath_title, 0).text - - url = eval_xpath_getindex(entry, xpath_id, 0).text - abstract = eval_xpath_getindex(entry, xpath_summary, 0).text - - authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] - - # doi - doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) - doi = None 
if doi_element is None else doi_element.text - - # pdf - pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) - pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') - - # journal - journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) - journal = None if journal_element is None else journal_element.text - - # tags - tag_elements = eval_xpath(entry, xpath_category) - tags = [str(tag) for tag in tag_elements] - - # comments - comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None) - comments = None if comments_elements is None else comments_elements.text - - publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ') - - res_dict = { - 'template': 'paper.html', - 'url': url, - 'title': title, - 'publishedDate': publishedDate, - 'content': abstract, - 'doi': doi, - 'authors': authors, - 'journal': journal, - 'tags': tags, - 'comments': comments, - 'pdf_url': pdf_url, - } - - results.append(res_dict) - - return results diff --git a/apps/searxng/searx/engines/bandcamp.py b/apps/searxng/searx/engines/bandcamp.py deleted file mode 100755 index 8feff1f..0000000 --- a/apps/searxng/searx/engines/bandcamp.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Bandcamp (Music) - -@website https://bandcamp.com/ -@provide-api no -@results HTML -@parse url, title, content, publishedDate, iframe_src, thumbnail - -""" - -from urllib.parse import urlencode, urlparse, parse_qs -from dateutil.parser import parse as dateparse -from lxml import html - -from searx.utils import ( - eval_xpath_getindex, - eval_xpath_list, - extract_text, -) - -# about -about = { - "website": 'https://bandcamp.com/', - "wikidata_id": 'Q545966', - "official_api_documentation": 'https://bandcamp.com/developer', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = ['music'] -paging = True - -base_url = "https://bandcamp.com/" -search_string = 'search?{query}&page={page}' -iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small" - - -def request(query, params): - '''pre-request callback - - params: - method : POST/GET - headers : {} - data : {} # if method == POST - url : '' - category: 'search category' - pageno : 1 # number of the requested page - ''' - - search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno']) - params['url'] = base_url + search_path - return params - - -def response(resp): - '''post-response callback - - resp: requests response object - ''' - results = [] - dom = html.fromstring(resp.text) - - for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'): - - link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None) - if link is None: - continue - - title = result.xpath('.//div[@class="heading"]/a/text()') - content = result.xpath('.//div[@class="subhead"]/text()') - new_result = { - "url": extract_text(link), - "title": extract_text(title), - "content": extract_text(content), - } - - date = eval_xpath_getindex(result, '//div[@class="released"]/text()', 0, default=None) - if date: - new_result["publishedDate"] = dateparse(date.replace("released ", "")) - - thumbnail = result.xpath('.//div[@class="art"]/img/@src') - if thumbnail: - new_result['img_src'] = thumbnail[0] - - result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0] - itemtype = 
extract_text(result.xpath('.//div[@class="itemtype"]')).lower() - if "album" == itemtype: - new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id) - elif "track" == itemtype: - new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id) - - results.append(new_result) - return results diff --git a/apps/searxng/searx/engines/base.py b/apps/searxng/searx/engines/base.py deleted file mode 100755 index 5a2d666..0000000 --- a/apps/searxng/searx/engines/base.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - BASE (Scholar publications) -""" - -from urllib.parse import urlencode -from lxml import etree -from datetime import datetime -import re -from searx.utils import searx_useragent - -# about -about = { - "website": 'https://base-search.net', - "wikidata_id": 'Q448335', - "official_api_documentation": 'https://api.base-search.net/', - "use_official_api": True, - "require_api_key": False, - "results": 'XML', -} - -categories = ['science'] - -base_url = ( - 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi' - + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}' -) - -# engine dependent config -paging = True -number_of_results = 10 - -# shortcuts for advanced search -shorcut_dict = { - # user-friendly keywords - 'format:': 'dcformat:', - 'author:': 'dccreator:', - 'collection:': 'dccollection:', - 'hdate:': 'dchdate:', - 'contributor:': 'dccontributor:', - 'coverage:': 'dccoverage:', - 'date:': 'dcdate:', - 'abstract:': 'dcdescription:', - 'urls:': 'dcidentifier:', - 'language:': 'dclanguage:', - 'publisher:': 'dcpublisher:', - 'relation:': 'dcrelation:', - 'rights:': 'dcrights:', - 'source:': 'dcsource:', - 'subject:': 'dcsubject:', - 'title:': 'dctitle:', - 'type:': 'dcdctype:', -} - - -def request(query, params): - # replace shortcuts with API advanced search keywords - for key in shorcut_dict.keys(): - query = re.sub(key, shorcut_dict[key], query) - - # basic search - offset = (params['pageno'] - 1) * number_of_results - - string_args = dict(query=urlencode({'query': query}), offset=offset, hits=number_of_results) - - params['url'] = base_url.format(**string_args) - - params['headers']['User-Agent'] = searx_useragent() - return params - - -def response(resp): - results = [] - - search_results = etree.XML(resp.content) - - for entry in search_results.xpath('./result/doc'): - content = "No description available" - - date = datetime.now() # needed in case no dcdate is available for an item - for item in entry: - if item.attrib["name"] == "dcdate": - date = item.text - - elif item.attrib["name"] == "dctitle": - title = item.text - - elif item.attrib["name"] == "dclink": - url = item.text - - elif item.attrib["name"] == "dcdescription": - content = item.text[:300] - if len(item.text) > 300: - content += "..." 
- - # dates returned by the BASE API are not several formats - publishedDate = None - for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']: - try: - publishedDate = datetime.strptime(date, date_format) - break - except: - pass - - if publishedDate is not None: - res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} - else: - res_dict = {'url': url, 'title': title, 'content': content} - - results.append(res_dict) - - return results diff --git a/apps/searxng/searx/engines/bing.py b/apps/searxng/searx/engines/bing.py deleted file mode 100755 index 3cd7078..0000000 --- a/apps/searxng/searx/engines/bing.py +++ /dev/null @@ -1,337 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This is the implementation of the Bing-WEB engine. Some of this -implementations are shared by other engines: - -- :ref:`bing images engine` -- :ref:`bing news engine` -- :ref:`bing videos engine` - -On the `preference page`_ Bing offers a lot of languages an regions (see section -'Search results languages' and 'Country/region'). However, the abundant choice -does not correspond to reality, where Bing has a full-text indexer only for a -limited number of languages. By example: you can select a language like Māori -but you never get a result in this language. - -What comes a bit closer to the truth are the `search-APIs`_ but they don`t seem -to be completely correct either (if you take a closer look you will find some -inaccuracies there too): - -- :py:obj:`searx.engines.bing.bing_traits_url` -- :py:obj:`searx.engines.bing_videos.bing_traits_url` -- :py:obj:`searx.engines.bing_images.bing_traits_url` -- :py:obj:`searx.engines.bing_news.bing_traits_url` - -.. _preference page: https://www.bing.com/account/general -.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/ - -""" -# pylint: disable=too-many-branches, invalid-name - -from typing import TYPE_CHECKING -import datetime -import re -import uuid -from urllib.parse import urlencode -from lxml import html -import babel -import babel.languages - -from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex -from searx.locales import language_tag, region_tag -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -about = { - "website": 'https://www.bing.com', - "wikidata_id": 'Q182496', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -send_accept_language_header = True -"""Bing tries to guess user's language and territory from the HTTP -Accept-Language. 
Optional the user can select a search-language (can be -different to the UI language) and a region (market code).""" - -# engine dependent config -categories = ['general', 'web'] -paging = True -time_range_support = True -safesearch = True -safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT - -base_url = 'https://www.bing.com/search' -"""Bing (Web) search URL""" - -bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes' -"""Bing (Web) search API description""" - - -def _get_offset_from_pageno(pageno): - return (pageno - 1) * 10 + 1 - - -def set_bing_cookies(params, engine_language, engine_region, SID): - - # set cookies - # ----------- - - params['cookies']['_EDGE_V'] = '1' - - # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw - _EDGE_S = [ - 'F=1', - 'SID=%s' % SID, - 'mkt=%s' % engine_region.lower(), - 'ui=%s' % engine_language.lower(), - ] - params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S) - logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S']) - - # "_EDGE_CD": "m=zh-tw", - - _EDGE_CD = [ # pylint: disable=invalid-name - 'm=%s' % engine_region.lower(), # search region: zh-cn - 'u=%s' % engine_language.lower(), # UI: en-us - ] - - params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';' - logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD']) - - SRCHHPGUSR = [ # pylint: disable=invalid-name - 'SRCHLANG=%s' % engine_language, - # Trying to set ADLT cookie here seems not to have any effect, I assume - # there is some age verification by a cookie (and/or session ID) needed, - # to disable the SafeSearch. - 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'), - ] - params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR) - logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR']) - - -def request(query, params): - """Assemble a Bing-Web request.""" - - engine_region = traits.get_region(params['searxng_locale'], 'en-US') - engine_language = traits.get_language(params['searxng_locale'], 'en') - - SID = uuid.uuid1().hex.upper() - CVID = uuid.uuid1().hex.upper() - - set_bing_cookies(params, engine_language, engine_region, SID) - - # build URL query - # --------------- - - # query term - page = int(params.get('pageno', 1)) - query_params = { - # fmt: off - 'q': query, - 'pq': query, - 'cvid': CVID, - 'qs': 'n', - 'sp': '-1' - # fmt: on - } - - # page - if page > 1: - referer = base_url + '?' + urlencode(query_params) - params['headers']['Referer'] = referer - logger.debug("headers.Referer --> %s", referer) - - query_params['first'] = _get_offset_from_pageno(page) - - if page == 2: - query_params['FORM'] = 'PERE' - elif page > 2: - query_params['FORM'] = 'PERE%s' % (page - 2) - - filters = '' - if params['time_range']: - query_params['filt'] = 'custom' - - if params['time_range'] == 'day': - filters = 'ex1:"ez1"' - elif params['time_range'] == 'week': - filters = 'ex1:"ez2"' - elif params['time_range'] == 'month': - filters = 'ex1:"ez3"' - elif params['time_range'] == 'year': - epoch_1970 = datetime.date(1970, 1, 1) - today_no = (datetime.date.today() - epoch_1970).days - filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no) - - params['url'] = base_url + '?' 
+ urlencode(query_params) - if filters: - params['url'] = params['url'] + '&filters=' + filters - return params - - -def response(resp): - # pylint: disable=too-many-locals,import-outside-toplevel - - from searx.network import Request, multi_requests # see https://github.com/searxng/searxng/issues/762 - - results = [] - result_len = 0 - - dom = html.fromstring(resp.text) - - # parse results again if nothing is found yet - - url_to_resolve = [] - url_to_resolve_index = [] - i = 0 - for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'): - - link = eval_xpath_getindex(result, './/h2/a', 0, None) - if link is None: - continue - url = link.attrib.get('href') - title = extract_text(link) - - content = eval_xpath(result, '(.//p)[1]') - for p in content: - # Make sure that the element is free of links - for e in p.xpath('.//a'): - e.getparent().remove(e) - content = extract_text(content) - - # get the real URL either using the URL shown to user or following the Bing URL - if url.startswith('https://www.bing.com/ck/a?'): - url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) - # Bing can shorten the URL either at the end or in the middle of the string - if ( - url_cite - and url_cite.startswith('https://') - and '…' not in url_cite - and '...' not in url_cite - and '›' not in url_cite - ): - # no need for an additional HTTP request - url = url_cite - else: - # resolve the URL with an additional HTTP request - url_to_resolve.append(url.replace('&ntb=1', '&ntb=F')) - url_to_resolve_index.append(i) - url = None # remove the result if the HTTP Bing redirect raise an exception - - # append result - results.append({'url': url, 'title': title, 'content': content}) - # increment result pointer for the next iteration in this loop - i += 1 - - # resolve all Bing redirections in parallel - request_list = [ - Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve - ] - response_list = multi_requests(request_list) - for i, redirect_response in enumerate(response_list): - if not isinstance(redirect_response, Exception): - results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] - - # get number_of_results - try: - result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) - if "-" in result_len_container: - - # Remove the part "from-to" for paginated request ... 
- result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :] - - result_len_container = re.sub('[^0-9]', '', result_len_container) - - if len(result_len_container) > 0: - result_len = int(result_len_container) - - except Exception as e: # pylint: disable=broad-except - logger.debug('result error :\n%s', e) - - if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: - return [] - - results.append({'number_of_results': result_len}) - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and regions from Bing-Web.""" - - xpath_market_codes = '//table[1]/tbody/tr/td[3]' - # xpath_country_codes = '//table[2]/tbody/tr/td[2]' - xpath_language_codes = '//table[3]/tbody/tr/td[2]' - - _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) - - -def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str): - # pylint: disable=too-many-locals,import-outside-toplevel - - from searx.network import get # see https://github.com/searxng/searxng/issues/762 - - # insert alias to map from a language (zh) to a language + script (zh_Hans) - engine_traits.languages['zh'] = 'zh-hans' - - resp = get(url) - - if not resp.ok: # type: ignore - print("ERROR: response from peertube is not OK.") - - dom = html.fromstring(resp.text) # type: ignore - - map_lang = {'jp': 'ja'} - for td in eval_xpath(dom, xpath_language_codes): - eng_lang = td.text - - if eng_lang in ('en-gb', 'pt-br'): - # language 'en' is already in the list and a language 'en-gb' can't - # be handled in SearXNG, same with pt-br which is covered by pt-pt. - continue - - babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_') - try: - sxng_tag = language_tag(babel.Locale.parse(babel_lang)) - except babel.UnknownLocaleError: - print("ERROR: language (%s) is unknown by babel" % (eng_lang)) - continue - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != eng_lang: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) - continue - engine_traits.languages[sxng_tag] = eng_lang - - map_region = { - 'en-ID': 'id_ID', - 'no-NO': 'nb_NO', - } - - for td in eval_xpath(dom, xpath_market_codes): - eng_region = td.text - babel_region = map_region.get(eng_region, eng_region).replace('-', '_') - - if eng_region == 'en-WW': - engine_traits.all_locale = eng_region - continue - - try: - sxng_tag = region_tag(babel.Locale.parse(babel_region)) - except babel.UnknownLocaleError: - print("ERROR: region (%s) is unknown by babel" % (eng_region)) - continue - conflict = engine_traits.regions.get(sxng_tag) - if conflict: - if conflict != eng_region: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region)) - continue - engine_traits.regions[sxng_tag] = eng_region diff --git a/apps/searxng/searx/engines/bing_images.py b/apps/searxng/searx/engines/bing_images.py deleted file mode 100755 index bd3a34a..0000000 --- a/apps/searxng/searx/engines/bing_images.py +++ /dev/null @@ -1,132 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Bing-Images: description see :py:obj:`searx.engines.bing`. 
-""" -# pylint: disable=invalid-name - - -from typing import TYPE_CHECKING -import uuid -import json -from urllib.parse import urlencode - -from lxml import html - -from searx.enginelib.traits import EngineTraits -from searx.engines.bing import ( - set_bing_cookies, - _fetch_traits, -) -from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://www.bing.com/images', - "wikidata_id": 'Q182496', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['images', 'web'] -paging = True -safesearch = True -time_range_support = True - -base_url = 'https://www.bing.com/images/async' -"""Bing (Images) search URL""" - -bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes' -"""Bing (Images) search API description""" - -time_map = { - # fmt: off - 'day': 60 * 24, - 'week': 60 * 24 * 7, - 'month': 60 * 24 * 31, - 'year': 60 * 24 * 365, - # fmt: on -} - - -def request(query, params): - """Assemble a Bing-Image request.""" - - engine_region = traits.get_region(params['searxng_locale'], 'en-US') - engine_language = traits.get_language(params['searxng_locale'], 'en') - - SID = uuid.uuid1().hex.upper() - set_bing_cookies(params, engine_language, engine_region, SID) - - # build URL query - # - example: https://www.bing.com/images/async?q=foo&first=155&count=35 - - query_params = { - # fmt: off - 'q': query, - 'async' : 'content', - # to simplify the page count lets use the default of 35 images per page - 'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, - 'count' : 35, - # fmt: on - } - - # time range - # - example: one year (525600 minutes) 'qft=+filterui:age-lt525600' - - if params['time_range']: - query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']] - - params['url'] = base_url + '?' 
+ urlencode(query_params) - - return params - - -def response(resp): - """Get response from Bing-Images""" - - results = [] - dom = html.fromstring(resp.text) - - for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'): - - metadata = result.xpath('.//a[@class="iusc"]/@m') - if not metadata: - continue - - metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0]) - title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip() - img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip() - source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip() - results.append( - { - 'template': 'images.html', - 'url': metadata['purl'], - 'thumbnail_src': metadata['turl'], - 'img_src': metadata['murl'], - 'content': metadata['desc'], - 'title': title, - 'source': source, - 'img_format': img_format, - } - ) - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and regions from Bing-News.""" - - xpath_market_codes = '//table[1]/tbody/tr/td[3]' - # xpath_country_codes = '//table[2]/tbody/tr/td[2]' - xpath_language_codes = '//table[3]/tbody/tr/td[2]' - - _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/apps/searxng/searx/engines/bing_news.py b/apps/searxng/searx/engines/bing_news.py deleted file mode 100755 index d8c6385..0000000 --- a/apps/searxng/searx/engines/bing_news.py +++ /dev/null @@ -1,150 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Bing-News: description see :py:obj:`searx.engines.bing`. -""" - -# pylint: disable=invalid-name - -from typing import TYPE_CHECKING -import uuid -from urllib.parse import urlencode - -from lxml import html - -from searx.enginelib.traits import EngineTraits -from searx.engines.bing import ( - set_bing_cookies, - _fetch_traits, -) -from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - - -# about -about = { - "website": 'https://www.bing.com/news', - "wikidata_id": 'Q2878637', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api', - "use_official_api": False, - "require_api_key": False, - "results": 'RSS', -} - -# engine dependent config -categories = ['news'] -paging = True -time_range_support = True -time_map = { - 'day': '4', - 'week': '8', - 'month': '9', -} -"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the -difference of *last day* and *last week* in the result list is just marginally. -""" - -base_url = 'https://www.bing.com/news/infinitescrollajax' -"""Bing (News) search URL""" - -bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes' -"""Bing (News) search API description""" - -mkt_alias = { - 'zh': 'en-WW', - 'zh-CN': 'en-WW', -} -"""Bing News has an official market code 'zh-CN' but we won't get a result with -this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate* -market code (en-WW). 
-""" - - -def request(query, params): - """Assemble a Bing-News request.""" - - sxng_locale = params['searxng_locale'] - engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale) - engine_language = traits.get_language(sxng_locale, 'en') - - SID = uuid.uuid1().hex.upper() - set_bing_cookies(params, engine_language, engine_region, SID) - - # build URL query - # - # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 - - query_params = { - # fmt: off - 'q': query, - 'InfiniteScroll': 1, - # to simplify the page count lets use the default of 10 images per page - 'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1, - # fmt: on - } - - if params['time_range']: - # qft=interval:"7" - query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9') - - params['url'] = base_url + '?' + urlencode(query_params) - - return params - - -def response(resp): - """Get response from Bing-Video""" - results = [] - - if not resp.ok or not resp.text: - return results - - dom = html.fromstring(resp.text) - - for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'): - - url = newsitem.xpath('./@url')[0] - title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip() - content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip() - thumbnail = None - author = newsitem.xpath('./@data-author')[0] - metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip() - - img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src') - if img_src: - thumbnail = 'https://www.bing.com/' + img_src[0] - - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'img_src': thumbnail, - 'author': author, - 'metadata': metadata, - } - ) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and regions from Bing-News. - - The :py:obj:`description ` of the - first table says *"query parameter when calling the Video Search API."* - .. thats why I use the 4. table "News Category API markets" for the - ``xpath_market_codes``. - - """ - - xpath_market_codes = '//table[4]/tbody/tr/td[3]' - # xpath_country_codes = '//table[2]/tbody/tr/td[2]' - xpath_language_codes = '//table[3]/tbody/tr/td[2]' - - _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/apps/searxng/searx/engines/bing_videos.py b/apps/searxng/searx/engines/bing_videos.py deleted file mode 100755 index 8ee0bb6..0000000 --- a/apps/searxng/searx/engines/bing_videos.py +++ /dev/null @@ -1,128 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Bing-Videos: description see :py:obj:`searx.engines.bing`. 
-""" -# pylint: disable=invalid-name - -from typing import TYPE_CHECKING -import uuid -import json -from urllib.parse import urlencode - -from lxml import html - -from searx.enginelib.traits import EngineTraits -from searx.engines.bing import ( - set_bing_cookies, - _fetch_traits, -) -from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - - -about = { - "website": 'https://www.bing.com/videos', - "wikidata_id": 'Q4914152', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['videos', 'web'] -paging = True -safesearch = True -time_range_support = True - -base_url = 'https://www.bing.com/videos/asyncv2' -"""Bing (Videos) async search URL.""" - -bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes' -"""Bing (Video) search API description""" - -time_map = { - # fmt: off - 'day': 60 * 24, - 'week': 60 * 24 * 7, - 'month': 60 * 24 * 31, - 'year': 60 * 24 * 365, - # fmt: on -} - - -def request(query, params): - """Assemble a Bing-Video request.""" - - engine_region = traits.get_region(params['searxng_locale'], 'en-US') - engine_language = traits.get_language(params['searxng_locale'], 'en') - - SID = uuid.uuid1().hex.upper() - set_bing_cookies(params, engine_language, engine_region, SID) - - # build URL query - # - # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35 - - query_params = { - # fmt: off - 'q': query, - 'async' : 'content', - # to simplify the page count lets use the default of 35 images per page - 'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, - 'count' : 35, - # fmt: on - } - - # time range - # - # example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR' - - if params['time_range']: - query_params['form'] = 'VRFLTR' - query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']] - - params['url'] = base_url + '?' 
+ urlencode(query_params) - - return params - - -def response(resp): - """Get response from Bing-Video""" - results = [] - - dom = html.fromstring(resp.text) - - for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'): - metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) - info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() - content = '{0} - {1}'.format(metadata['du'], info) - thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0] - - results.append( - { - 'url': metadata['murl'], - 'thumbnail': thumbnail, - 'title': metadata.get('vt', ''), - 'content': content, - 'template': 'videos.html', - } - ) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and regions from Bing-Videos.""" - - xpath_market_codes = '//table[1]/tbody/tr/td[3]' - # xpath_country_codes = '//table[2]/tbody/tr/td[2]' - xpath_language_codes = '//table[3]/tbody/tr/td[2]' - - _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/apps/searxng/searx/engines/brave.py b/apps/searxng/searx/engines/brave.py deleted file mode 100755 index f455992..0000000 --- a/apps/searxng/searx/engines/brave.py +++ /dev/null @@ -1,419 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Brave supports the categories listed in :py:obj:`brave_category` (General, -news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range -` is limited (see remarks). - -Configured ``brave`` engines: - -.. code:: yaml - - - name: brave - engine: brave - ... - brave_category: search - time_range_support: true - paging: true - - - name: brave.images - engine: brave - ... - brave_category: images - - - name: brave.videos - engine: brave - ... - brave_category: videos - - - name: brave.news - engine: brave - ... - brave_category: news - - -.. _brave regions: - -Brave regions -============= - -Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with -locales. To get a mapping, all *officatl de-facto* languages of the Brave -region are mapped to regions in SearXNG (see :py:obj:`babel -`): - -.. code:: python - - "regions": { - .. - "en-CA": "ca", - "fr-CA": "ca", - .. - } - - -.. note:: - - The language (aka region) support of Brave's index is limited to very basic - languages. The search results for languages like Chinese or Arabic are of - low quality. - - -.. _brave languages: - -Brave languages -=============== - -Brave's language support is limited to the UI (menues, area local notations, -etc). Brave's index only seems to support a locale, but it does not seem to -support any languages in its index. The choice of available languages is very -small (and its not clear to me where the differencee in UI is when switching -from en-us to en-ca or en-gb). - -In the :py:obj:`EngineTraits object ` the -UI languages are stored in a custom field named ``ui_lang``: - -.. 
code:: python - - "custom": { - "ui_lang": { - "ca": "ca", - "de-DE": "de-de", - "en-CA": "en-ca", - "en-GB": "en-gb", - "en-US": "en-us", - "es": "es", - "fr-CA": "fr-ca", - "fr-FR": "fr-fr", - "ja-JP": "ja-jp", - "pt-BR": "pt-br", - "sq-AL": "sq-al" - } - }, - -Implementations -=============== - -""" - -from typing import TYPE_CHECKING - -import re -from urllib.parse import ( - urlencode, - urlparse, - parse_qs, -) - -import chompjs -from lxml import html - -from searx import locales -from searx.utils import ( - extract_text, - eval_xpath_list, - eval_xpath_getindex, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -about = { - "website": 'https://search.brave.com/', - "wikidata_id": 'Q22906900', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -base_url = "https://search.brave.com/" -categories = [] -brave_category = 'search' -"""Brave supports common web-search, video search, image and video search. - -- ``search``: Common WEB search -- ``videos``: search for videos -- ``images``: search for images -- ``news``: search for news -""" - -brave_spellcheck = False -"""Brave supports some kind of spell checking. When activated, Brave tries to -fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In -the UI of Brave the user gets warned about this, since we can not warn the user -in SearXNG, the spellchecking is disabled by default. -""" - -send_accept_language_header = True -paging = False -"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI -category All).""" - -safesearch = True -safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off - -time_range_support = False -"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI -category All).""" - -time_range_map = { - 'day': 'pd', - 'week': 'pw', - 'month': 'pm', - 'year': 'py', -} - - -def request(query, params): - - # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787 - params['headers']['Accept-Encoding'] = 'gzip, deflate' - - args = { - 'q': query, - } - if brave_spellcheck: - args['spellcheck'] = '1' - - if brave_category == 'search': - if params.get('pageno', 1) - 1: - args['offset'] = params.get('pageno', 1) - 1 - if time_range_map.get(params['time_range']): - args['tf'] = time_range_map.get(params['time_range']) - - params["url"] = f"{base_url}{brave_category}?{urlencode(args)}" - - # set properties in the cookies - - params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off') - # the useLocation is IP based, we use cookie 'country' for the region - params['cookies']['useLocation'] = '0' - params['cookies']['summarizer'] = '0' - - engine_region = traits.get_region(params['searxng_locale'], 'all') - params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore - - ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us') - params['cookies']['ui_lang'] = ui_lang - - logger.debug("cookies %s", params['cookies']) - - -def response(resp): - - if brave_category == 'search': - return _parse_search(resp) - - datastr = "" - for line in resp.text.split("\n"): - if "const data = " in line: - datastr = line.replace("const data = ", "").strip()[:-1] - break - - json_data = chompjs.parse_js_object(datastr) - json_resp = json_data[1]['data']['body']['response'] - - if brave_category == 'news': - 
json_resp = json_resp['news'] - return _parse_news(json_resp) - - if brave_category == 'images': - return _parse_images(json_resp) - if brave_category == 'videos': - return _parse_videos(json_resp) - - raise ValueError(f"Unsupported brave category: {brave_category}") - - -def _parse_search(resp): - - result_list = [] - dom = html.fromstring(resp.text) - - answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None) - if answer_tag: - result_list.append({'answer': extract_text(answer_tag)}) - - # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]' - xpath_results = '//div[contains(@class, "snippet")]' - - for result in eval_xpath_list(dom, xpath_results): - - url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None) - title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None) - if not (url and title_tag): - continue - - content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='') - img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='') - - item = { - 'url': url, - 'title': extract_text(title_tag), - 'content': extract_text(content_tag), - 'img_src': img_src, - } - - video_tag = eval_xpath_getindex( - result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None - ) - if video_tag is not None: - - # In my tests a video tag in the WEB search was mostoften not a - # video, except the ones from youtube .. - - iframe_src = _get_iframe_src(url) - if iframe_src: - item['iframe_src'] = iframe_src - item['template'] = 'videos.html' - item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') - else: - item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') - - result_list.append(item) - - return result_list - - -def _get_iframe_src(url): - parsed_url = urlparse(url) - if parsed_url.path == '/watch' and parsed_url.query: - video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore - if video_id: - return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore - return None - - -def _parse_news(json_resp): - result_list = [] - - for result in json_resp["results"]: - item = { - 'url': result['url'], - 'title': result['title'], - 'content': result['description'], - } - if result['thumbnail'] != "null": - item['img_src'] = result['thumbnail']['src'] - result_list.append(item) - - return result_list - - -def _parse_images(json_resp): - result_list = [] - - for result in json_resp["results"]: - item = { - 'url': result['url'], - 'title': result['title'], - 'content': result['description'], - 'template': 'images.html', - 'img_format': result['properties']['format'], - 'source': result['source'], - 'img_src': result['properties']['url'], - } - result_list.append(item) - - return result_list - - -def _parse_videos(json_resp): - result_list = [] - - for result in json_resp["results"]: - - url = result['url'] - item = { - 'url': url, - 'title': result['title'], - 'content': result['description'], - 'template': 'videos.html', - 'length': result['video']['duration'], - 'duration': result['video']['duration'], - } - - if result['thumbnail'] != "null": - item['thumbnail'] = result['thumbnail']['src'] - - iframe_src = _get_iframe_src(url) - if iframe_src: - item['iframe_src'] = iframe_src - - result_list.append(item) - - return result_list - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch :ref:`languages ` and :ref:`regions ` from 
Brave.""" - - # pylint: disable=import-outside-toplevel - - import babel.languages - from searx.locales import region_tag, language_tag - from searx.network import get # see https://github.com/searxng/searxng/issues/762 - - engine_traits.custom["ui_lang"] = {} - - headers = { - 'Accept-Encoding': 'gzip, deflate', - } - lang_map = {'no': 'nb'} # norway - - # languages (UI) - - resp = get('https://search.brave.com/settings', headers=headers) - - if not resp.ok: # type: ignore - print("ERROR: response from Brave is not OK.") - dom = html.fromstring(resp.text) # type: ignore - - for option in dom.xpath('//div[@id="language-select"]//option'): - - ui_lang = option.get('value') - try: - if '-' in ui_lang: - sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-')) - else: - sxng_tag = language_tag(babel.Locale.parse(ui_lang)) - - except babel.UnknownLocaleError: - print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang) - continue - - conflict = engine_traits.custom["ui_lang"].get(sxng_tag) - if conflict: - if conflict != ui_lang: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang)) - continue - engine_traits.custom["ui_lang"][sxng_tag] = ui_lang - - # search regions of brave - - engine_traits.all_locale = 'all' - - for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'): - - flag = country.xpath('./span[contains(@class, "flag")]')[0] - # country_name = extract_text(flag.xpath('./following-sibling::*')[0]) - country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1) # type: ignore - - # add offical languages of the country .. - for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True): - lang_tag = lang_map.get(lang_tag, lang_tag) - sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper()))) - # print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag)) - - conflict = engine_traits.regions.get(sxng_tag) - if conflict: - if conflict != country_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag)) - continue - engine_traits.regions[sxng_tag] = country_tag diff --git a/apps/searxng/searx/engines/bt4g.py b/apps/searxng/searx/engines/bt4g.py deleted file mode 100755 index 34717ae..0000000 --- a/apps/searxng/searx/engines/bt4g.py +++ /dev/null @@ -1,124 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content and only -collects torrent metadata (such as file names and file sizes) and a magnet link -(torrent identifier). - -This engine does not parse the HTML page because there is an API in XML (RSS). -The RSS feed provides fewer data like amount of seeders/leechers and the files -in the torrent file. It's a tradeoff for a "stable" engine as the XML from RSS -content will change way less than the HTML page. - -.. _BT4G: https://bt4g.com/ - -Configuration -============= - -The engine has the following additional settings: - -- :py:obj:`bt4g_order_by` -- :py:obj:`bt4g_category` - -With this options a SearXNG maintainer is able to configure **additional** -engines for specific torrent searches. For example a engine to search only for -Movies and sort the result list by the count of seeders. - -.. 
code:: yaml - - - name: bt4g.movie - engine: bt4g - shortcut: bt4gv - categories: video - bt4g_order_by: seeders - bt4g_category: 'movie' - -Implementations -=============== - -""" - -import re -from datetime import datetime -from urllib.parse import quote - -from lxml import etree - -from searx.utils import get_torrent_size - -# about -about = { - "website": 'https://bt4gprx.com', - "use_official_api": False, - "require_api_key": False, - "results": 'XML', -} - -# engine dependent config -categories = ['files'] -paging = True -time_range_support = True - -# search-url -url = 'https://bt4gprx.com' -search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss' -bt4g_order_by = 'relevance' -"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders`` -or ``time``. - -.. hint:: - - When *time_range* is activate, the results always orderd by ``time``. -""" - -bt4g_category = 'all' -"""BT$G offers categoies: ``all`` (default), ``audio``, ``movie``, ``doc``, -``app`` and `` other``. -""" - - -def request(query, params): - - order_by = bt4g_order_by - if params['time_range']: - order_by = 'time' - - params['url'] = search_url.format( - search_term=quote(query), - order_by=order_by, - category=bt4g_category, - pageno=params['pageno'], - ) - return params - - -def response(resp): - results = [] - - search_results = etree.XML(resp.content) - - # return empty array if nothing is found - if len(search_results) == 0: - return [] - - for entry in search_results.xpath('./channel/item'): - title = entry.find("title").text - link = entry.find("guid").text - fullDescription = entry.find("description").text.split('
<br>') - filesize = fullDescription[1] - filesizeParsed = re.split(r"([A-Z]+)", filesize) - magnetlink = entry.find("link").text - pubDate = entry.find("pubDate").text - results.append( - { - 'url': link, - 'title': title, - 'magnetlink': magnetlink, - 'seed': 'N/A', - 'leech': 'N/A', - 'filesize': get_torrent_size(filesizeParsed[0], filesizeParsed[1]), - 'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'), - 'template': 'torrent.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/btdigg.py b/apps/searxng/searx/engines/btdigg.py deleted file mode 100755 index c5dd921..0000000 --- a/apps/searxng/searx/engines/btdigg.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - BTDigg (Videos, Music, Files) -""" - -from lxml import html -from urllib.parse import quote, urljoin -from searx.utils import extract_text, get_torrent_size - -# about -about = { - "website": 'https://btdig.com', - "wikidata_id": 'Q4836698', - "official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'}, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files'] -paging = True - -# search-url -url = 'https://btdig.com' -search_url = url + '/search?q={search_term}&p={pageno}' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1) - - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - search_res = dom.xpath('//div[@class="one_result"]') - - # return empty array if nothing is found - if not search_res: - return [] - - # parse results - for result in search_res: - link = result.xpath('.//div[@class="torrent_name"]//a')[0] - href = urljoin(url, link.attrib.get('href')) - title = extract_text(link) - - excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0] - content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False) - # it is better to emit <br/>
instead of |, but html tags are verboten - content = content.strip().replace('\n', ' | ') - content = ' '.join(content.split()) - - filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0] - filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1] - files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0] - - # convert filesize to byte if possible - filesize = get_torrent_size(filesize, filesize_multiplier) - - # convert files to int if possible - try: - files = int(files) - except: - files = None - - magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href'] - - # append result - results.append( - { - 'url': href, - 'title': title, - 'content': content, - 'filesize': filesize, - 'files': files, - 'magnetlink': magnetlink, - 'template': 'torrent.html', - } - ) - - # return results sorted by seeder - return results diff --git a/apps/searxng/searx/engines/command.py b/apps/searxng/searx/engines/command.py deleted file mode 100755 index ffb8750..0000000 --- a/apps/searxng/searx/engines/command.py +++ /dev/null @@ -1,243 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -"""With *command engines* administrators can run engines to integrate arbitrary -shell commands. - -.. attention:: - - When creating and enabling a ``command`` engine on a public instance, you - must be careful to avoid leaking private data. - -The easiest solution is to limit the access by setting ``tokens`` as described -in section :ref:`private engines`. The engine base is flexible. Only your -imagination can limit the power of this engine (and maybe security concerns). - -Configuration -============= - -The following options are available: - -``command``: - A comma separated list of the elements of the command. A special token - ``{{QUERY}}`` tells where to put the search terms of the user. Example: - - .. code:: yaml - - ['ls', '-l', '-h', '{{QUERY}}'] - -``delimiter``: - A mapping containing a delimiter ``char`` and the *titles* of each element in - ``keys``. - -``parse_regex``: - A dict containing the regular expressions for each result key. - -``query_type``: - - The expected type of user search terms. Possible values: ``path`` and - ``enum``. - - ``path``: - Checks if the user provided path is inside the working directory. If not, - the query is not executed. - - ``enum``: - Is a list of allowed search terms. If the user submits something which is - not included in the list, the query returns an error. - -``query_enum``: - A list containing allowed search terms if ``query_type`` is set to ``enum``. - -``working_dir``: - The directory where the command has to be executed. Default: ``./``. - -``result_separator``: - The character that separates results. Default: ``\\n``. - -Example -======= - -The example engine below can be used to find files with a specific name in the -configured working directory: - -.. 
code:: yaml - - - name: find - engine: command - command: ['find', '.', '-name', '{{QUERY}}'] - query_type: path - shortcut: fnd - delimiter: - chars: ' ' - keys: ['line'] - -Implementations -=============== -""" - -import re -from os.path import expanduser, isabs, realpath, commonprefix -from shlex import split as shlex_split -from subprocess import Popen, PIPE -from threading import Thread - -from searx import logger - - -engine_type = 'offline' -paging = True -command = [] -delimiter = {} -parse_regex = {} -query_type = '' -query_enum = [] -environment_variables = {} -working_dir = realpath('.') -result_separator = '\n' -result_template = 'key-value.html' -timeout = 4.0 - -_command_logger = logger.getChild('command') -_compiled_parse_regex = {} - - -def init(engine_settings): - check_parsing_options(engine_settings) - - if 'command' not in engine_settings: - raise ValueError('engine command : missing configuration key: command') - - global command, working_dir, delimiter, parse_regex, environment_variables - - command = engine_settings['command'] - - if 'working_dir' in engine_settings: - working_dir = engine_settings['working_dir'] - if not isabs(engine_settings['working_dir']): - working_dir = realpath(working_dir) - - if 'parse_regex' in engine_settings: - parse_regex = engine_settings['parse_regex'] - for result_key, regex in parse_regex.items(): - _compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE) - if 'delimiter' in engine_settings: - delimiter = engine_settings['delimiter'] - - if 'environment_variables' in engine_settings: - environment_variables = engine_settings['environment_variables'] - - -def search(query, params): - cmd = _get_command_to_run(query) - if not cmd: - return [] - - results = [] - reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno'])) - reader_thread.start() - reader_thread.join(timeout=timeout) - - return results - - -def _get_command_to_run(query): - params = shlex_split(query) - __check_query_params(params) - - cmd = [] - for c in command: - if c == '{{QUERY}}': - cmd.extend(params) - else: - cmd.append(c) - - return cmd - - -def _get_results_from_process(results, cmd, pageno): - leftover = '' - count = 0 - start, end = __get_results_limits(pageno) - with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process: - line = process.stdout.readline() - while line: - buf = leftover + line.decode('utf-8') - raw_results = buf.split(result_separator) - if raw_results[-1]: - leftover = raw_results[-1] - raw_results = raw_results[:-1] - - for raw_result in raw_results: - result = __parse_single_result(raw_result) - if result is None: - _command_logger.debug('skipped result:', raw_result) - continue - - if start <= count and count <= end: - result['template'] = result_template - results.append(result) - - count += 1 - if end < count: - return results - - line = process.stdout.readline() - - return_code = process.wait(timeout=timeout) - if return_code != 0: - raise RuntimeError('non-zero return code when running command', cmd, return_code) - - -def __get_results_limits(pageno): - start = (pageno - 1) * 10 - end = start + 9 - return start, end - - -def __check_query_params(params): - if not query_type: - return - - if query_type == 'path': - query_path = params[-1] - query_path = expanduser(query_path) - if commonprefix([realpath(query_path), working_dir]) != working_dir: - raise ValueError('requested path is outside of configured working directory') - elif query_type == 'enum' and 
len(query_enum) > 0: - for param in params: - if param not in query_enum: - raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum) - - -def check_parsing_options(engine_settings): - """Checks if delimiter based parsing or regex parsing is configured correctly""" - - if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings: - raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex') - if 'delimiter' in engine_settings and 'parse_regex' in engine_settings: - raise ValueError('failed to init settings for parsing lines: too many settings') - - if 'delimiter' in engine_settings: - if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']: - raise ValueError - - -def __parse_single_result(raw_result): - """Parses command line output based on configuration""" - - result = {} - - if delimiter: - elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1) - if len(elements) != len(delimiter['keys']): - return {} - for i in range(len(elements)): - result[delimiter['keys'][i]] = elements[i] - - if parse_regex: - for result_key, regex in _compiled_parse_regex.items(): - found = regex.search(raw_result) - if not found: - return {} - result[result_key] = raw_result[found.start() : found.end()] - - return result diff --git a/apps/searxng/searx/engines/core.py b/apps/searxng/searx/engines/core.py deleted file mode 100755 index 2fa66e2..0000000 --- a/apps/searxng/searx/engines/core.py +++ /dev/null @@ -1,116 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""CORE (science) - -""" - -from datetime import datetime -from urllib.parse import urlencode - -from searx.exceptions import SearxEngineAPIException - -about = { - "website": 'https://core.ac.uk', - "wikidata_id": 'Q22661180', - "official_api_documentation": 'https://core.ac.uk/documentation/api/', - "use_official_api": True, - "require_api_key": True, - "results": 'JSON', -} - -categories = ['science', 'scientific publications'] -paging = True -nb_per_page = 10 - -api_key = 'unset' - -base_url = 'https://core.ac.uk:443/api-v2/search/' -search_string = '{query}?page={page}&pageSize={nb_per_page}&apiKey={apikey}' - - -def request(query, params): - - if api_key == 'unset': - raise SearxEngineAPIException('missing CORE API key') - - search_path = search_string.format( - query=urlencode({'q': query}), - nb_per_page=nb_per_page, - page=params['pageno'], - apikey=api_key, - ) - params['url'] = base_url + search_path - - return params - - -def response(resp): - results = [] - json_data = resp.json() - - for result in json_data['data']: - source = result['_source'] - url = None - if source.get('urls'): - url = source['urls'][0].replace('http://', 'https://', 1) - - if url is None and source.get('doi'): - # use the DOI reference - url = 'https://doi.org/' + source['doi'] - - if url is None and source.get('downloadUrl'): - # use the downloadUrl - url = source['downloadUrl'] - - if url is None and source.get('identifiers'): - # try to find an ark id, see - # https://www.wikidata.org/wiki/Property:P8091 - # and https://en.wikipedia.org/wiki/Archival_Resource_Key - arkids = [ - identifier[5:] # 5 is the length of "ark:/" - for identifier in source.get('identifiers') - if isinstance(identifier, str) and identifier.startswith('ark:/') - ] - if len(arkids) > 0: - url = 'https://n2t.net/' + arkids[0] - - if url is None: - continue - - publishedDate = None - time = source['publishedDate'] or 
source['depositedDate'] - if time: - publishedDate = datetime.fromtimestamp(time / 1000) - - # sometimes the 'title' is None / filter None values - journals = [j['title'] for j in (source.get('journals') or []) if j['title']] - - publisher = source['publisher'] - if publisher: - publisher = source['publisher'].strip("'") - - results.append( - { - 'template': 'paper.html', - 'title': source['title'], - 'url': url, - 'content': source['description'] or '', - # 'comments': '', - 'tags': source['topics'], - 'publishedDate': publishedDate, - 'type': (source['types'] or [None])[0], - 'authors': source['authors'], - 'editor': ', '.join(source['contributors'] or []), - 'publisher': publisher, - 'journal': ', '.join(journals), - # 'volume': '', - # 'pages' : '', - # 'number': '', - 'doi': source['doi'], - 'issn': [x for x in [source.get('issn')] if x], - 'isbn': [x for x in [source.get('isbn')] if x], # exists in the rawRecordXml - 'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'), - } - ) - - return results diff --git a/apps/searxng/searx/engines/crossref.py b/apps/searxng/searx/engines/crossref.py deleted file mode 100755 index e12a0da..0000000 --- a/apps/searxng/searx/engines/crossref.py +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Semantic Scholar (Science) -""" -# pylint: disable=use-dict-literal - -from urllib.parse import urlencode -from searx.utils import html_to_text - -about = { - "website": 'https://www.crossref.org/', - "wikidata_id": 'Q5188229', - "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc', - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['science', 'scientific publications'] -paging = True -search_url = 'https://api.crossref.org/works' - - -def request(query, params): - params['url'] = search_url + '?' 
+ urlencode(dict(query=query, offset=20 * (params['pageno'] - 1))) - return params - - -def response(resp): - res = resp.json() - results = [] - for record in res['message']['items']: - record_type = record['type'] - if record_type == 'book-chapter': - title = record['container-title'][0] - if record['title'][0].lower().strip() != title.lower().strip(): - title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')' - journal = None - else: - title = html_to_text(record['title'][0]) - journal = record.get('container-title', [None])[0] - url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL'] - authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])] - isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])] - results.append( - { - 'template': 'paper.html', - 'url': url, - 'title': title, - 'journal': journal, - 'volume': record.get('volume'), - 'type': record['type'], - 'content': html_to_text(record.get('abstract', '')), - 'publisher': record.get('publisher'), - 'authors': authors, - 'doi': record['DOI'], - 'isbn': isbn, - } - ) - return results diff --git a/apps/searxng/searx/engines/currency_convert.py b/apps/searxng/searx/engines/currency_convert.py deleted file mode 100755 index 18ea6cb..0000000 --- a/apps/searxng/searx/engines/currency_convert.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Currency convert (DuckDuckGo) -""" - -import json - -# about -about = { - "website": 'https://duckduckgo.com/', - "wikidata_id": 'Q12805', - "official_api_documentation": 'https://duckduckgo.com/api', - "use_official_api": False, - "require_api_key": False, - "results": 'JSONP', - "description": "Service from DuckDuckGo.", -} - -engine_type = 'online_currency' -categories = [] -base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' -weight = 100 - -https_support = True - - -def request(_query, params): - params['url'] = base_url.format(params['from'], params['to']) - return params - - -def response(resp): - """remove first and last lines to get only json""" - json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2] - results = [] - try: - conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount']) - except ValueError: - return results - answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format( - resp.search_params['amount'], - resp.search_params['from'], - resp.search_params['amount'] * conversion_rate, - resp.search_params['to'], - conversion_rate, - resp.search_params['from_name'], - resp.search_params['to_name'], - ) - - url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format( - resp.search_params['from'].upper(), resp.search_params['to'] - ) - - results.append({'answer': answer, 'url': url}) - - return results diff --git a/apps/searxng/searx/engines/dailymotion.py b/apps/searxng/searx/engines/dailymotion.py deleted file mode 100755 index 99da961..0000000 --- a/apps/searxng/searx/engines/dailymotion.py +++ /dev/null @@ -1,252 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Dailymotion (Videos) -~~~~~~~~~~~~~~~~~~~~ - -.. _REST GET: https://developers.dailymotion.com/tools/ -.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters -.. _Video filters API: https://developers.dailymotion.com/api/#video-filters -.. 
_Fields selection: https://developers.dailymotion.com/api/#fields-selection - -""" - -from typing import TYPE_CHECKING - -from datetime import datetime, timedelta -from urllib.parse import urlencode -import time -import babel - -from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762 -from searx.utils import html_to_text -from searx.exceptions import SearxEngineAPIException -from searx.locales import region_tag, language_tag -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://www.dailymotion.com', - "wikidata_id": 'Q769222', - "official_api_documentation": 'https://www.dailymotion.com/developer', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['videos'] -paging = True -number_of_results = 10 - -time_range_support = True -time_delta_dict = { - "day": timedelta(days=1), - "week": timedelta(days=7), - "month": timedelta(days=31), - "year": timedelta(days=365), -} - -safesearch = True -safesearch_params = { - 2: {'is_created_for_kids': 'true'}, - 1: {'is_created_for_kids': 'true'}, - 0: {}, -} -"""True if this video is "Created for Kids" / intends to target an audience -under the age of 16 (``is_created_for_kids`` in `Video filters API`_ ) -""" - -family_filter_map = { - 2: 'true', - 1: 'true', - 0: 'false', -} -"""By default, the family filter is turned on. Setting this parameter to -``false`` will stop filtering-out explicit content from searches and global -contexts (``family_filter`` in `Global API Parameters`_ ). -""" - -result_fields = [ - 'allow_embed', - 'description', - 'title', - 'created_time', - 'duration', - 'url', - 'thumbnail_360_url', - 'id', -] -"""`Fields selection`_, by default, a few fields are returned. To request more -specific fields, the ``fields`` parameter is used with the list of fields -SearXNG needs in the response to build a video result list. -""" - -search_url = 'https://api.dailymotion.com/videos?' -"""URL to retrieve a list of videos. - -- `REST GET`_ -- `Global API Parameters`_ -- `Video filters API`_ -""" - -iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" -"""URL template to embed video in SearXNG's result list.""" - - -def request(query, params): - - if not query: - return False - - eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore - eng_lang = traits.get_language(params['searxng_locale'], 'en') - - args = { - 'search': query, - 'family_filter': family_filter_map.get(params['safesearch'], 'false'), - 'thumbnail_ratio': 'original', # original|widescreen|square - # https://developers.dailymotion.com/api/#video-filters - 'languages': eng_lang, - 'page': params['pageno'], - 'password_protected': 'false', - 'private': 'false', - 'sort': 'relevance', - 'limit': number_of_results, - 'fields': ','.join(result_fields), - } - - args.update(safesearch_params.get(params['safesearch'], {})) - - # Don't add localization and country arguments if the user does select a - # language (:de, :en, ..) 
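# Illustrative note, not part of the upstream file: with a region locale such as
# 'de-DE' the branch below would send localization='de_DE' and country='DE',
# while a plain language locale like 'de' has no territory part, so neither
# parameter is added and Dailymotion falls back to its defaults.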
- - if len(params['searxng_locale'].split('-')) > 1: - # https://developers.dailymotion.com/api/#global-parameters - args['localization'] = eng_region - args['country'] = eng_region.split('_')[1] - # Insufficient rights for the `ams_country' parameter of route `GET /videos' - # 'ams_country': eng_region.split('_')[1], - - time_delta = time_delta_dict.get(params["time_range"]) - if time_delta: - created_after = datetime.now() - time_delta - args['created_after'] = datetime.timestamp(created_after) - - query_str = urlencode(args) - params['url'] = search_url + query_str - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_res = resp.json() - - # check for an API error - if 'error' in search_res: - raise SearxEngineAPIException(search_res['error'].get('message')) - - raise_for_httperror(resp) - - # parse results - for res in search_res.get('list', []): - - title = res['title'] - url = res['url'] - - content = html_to_text(res['description']) - if len(content) > 300: - content = content[:300] + '...' - - publishedDate = datetime.fromtimestamp(res['created_time'], None) - - length = time.gmtime(res.get('duration')) - if length.tm_hour: - length = time.strftime("%H:%M:%S", length) - else: - length = time.strftime("%M:%S", length) - - thumbnail = res['thumbnail_360_url'] - thumbnail = thumbnail.replace("http://", "https://") - - item = { - 'template': 'videos.html', - 'url': url, - 'title': title, - 'content': content, - 'publishedDate': publishedDate, - 'length': length, - 'thumbnail': thumbnail, - } - - # HINT: no mater what the value is, without API token videos can't shown - # embedded - if res['allow_embed']: - item['iframe_src'] = iframe_src.format(video_id=res['id']) - - results.append(item) - - # return results - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch locales & languages from dailymotion. - - Locales fetched from `api/locales `_. - There are duplications in the locale codes returned from Dailymotion which - can be ignored:: - - en_EN --> en_GB, en_US - ar_AA --> ar_EG, ar_AE, ar_SA - - The language list `api/languages `_ - contains over 7000 *languages* codes (see PR1071_). We use only those - language codes that are used in the locales. - - .. 
_PR1071: https://github.com/searxng/searxng/pull/1071 - - """ - - resp = get('https://api.dailymotion.com/locales') - if not resp.ok: # type: ignore - print("ERROR: response from dailymotion/locales is not OK.") - - for item in resp.json()['list']: # type: ignore - eng_tag = item['locale'] - if eng_tag in ('en_EN', 'ar_AA'): - continue - try: - sxng_tag = region_tag(babel.Locale.parse(eng_tag)) - except babel.UnknownLocaleError: - print("ERROR: item unknown --> %s" % item) - continue - - conflict = engine_traits.regions.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.regions[sxng_tag] = eng_tag - - locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()] - - resp = get('https://api.dailymotion.com/languages') - if not resp.ok: # type: ignore - print("ERROR: response from dailymotion/languages is not OK.") - - for item in resp.json()['list']: # type: ignore - eng_tag = item['code'] - if eng_tag in locale_lang_list: - sxng_tag = language_tag(babel.Locale.parse(eng_tag)) - engine_traits.languages[sxng_tag] = eng_tag diff --git a/apps/searxng/searx/engines/deepl.py b/apps/searxng/searx/engines/deepl.py deleted file mode 100755 index 8507271..0000000 --- a/apps/searxng/searx/engines/deepl.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Deepl translation engine""" - -from json import loads - -about = { - "website": 'https://deepl.com', - "wikidata_id": 'Q43968444', - "official_api_documentation": 'https://www.deepl.com/docs-api', - "use_official_api": True, - "require_api_key": True, - "results": 'JSON', -} - -engine_type = 'online_dictionary' -categories = ['general'] - -url = 'https://api-free.deepl.com/v2/translate' -api_key = None - - -def request(_query, params): - '''pre-request callback - - params: - - - ``method`` : POST/GET - - ``headers``: {} - - ``data``: {} # if method == POST - - ``url``: '' - - ``category``: 'search category' - - ``pageno``: 1 # number of the requested page - ''' - - params['url'] = url - params['method'] = 'POST' - params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]} - - return params - - -def response(resp): - results = [] - result = loads(resp.text) - translations = result['translations'] - - infobox = "
" - - for translation in translations: - infobox += f"
{translation['text']}
" - - infobox += "
" - - results.append( - { - 'infobox': 'Deepl', - 'content': infobox, - } - ) - - return results diff --git a/apps/searxng/searx/engines/deezer.py b/apps/searxng/searx/engines/deezer.py deleted file mode 100755 index 63c71e3..0000000 --- a/apps/searxng/searx/engines/deezer.py +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Deezer (Music) -""" - -from json import loads -from urllib.parse import urlencode - -# about -about = { - "website": 'https://deezer.com', - "wikidata_id": 'Q602243', - "official_api_documentation": 'https://developers.deezer.com/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['music'] -paging = True - -# search-url -url = 'https://api.deezer.com/' -search_url = url + 'search?{query}&index={offset}' -iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}" - -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * 25 - - params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_res = loads(resp.text) - - # parse results - for result in search_res.get('data', []): - if result['type'] == 'track': - title = result['title'] - url = result['link'] - - if url.startswith('http://'): - url = 'https' + url[4:] - - content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title']) - - # append result - results.append( - {'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content} - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/demo_offline.py b/apps/searxng/searx/engines/demo_offline.py deleted file mode 100755 index 9d6e3b5..0000000 --- a/apps/searxng/searx/engines/demo_offline.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Within this module we implement a *demo offline engine*. Do not look to -close to the implementation, its just a simple example. To get in use of this -*demo* engine add the following entry to your engines list in ``settings.yml``: - -.. code:: yaml - - - name: my offline engine - engine: demo_offline - shortcut: demo - disabled: false - -""" - -import json - -engine_type = 'offline' -categories = ['general'] -disabled = True -timeout = 2.0 - -about = { - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -# if there is a need for globals, use a leading underline -_my_offline_engine = None - - -def init(engine_settings=None): - """Initialization of the (offline) engine. The origin of this demo engine is a - simple json string which is loaded in this example while the engine is - initialized. - - """ - global _my_offline_engine # pylint: disable=global-statement - - _my_offline_engine = ( - '[ {"value": "%s"}' - ', {"value":"first item"}' - ', {"value":"second item"}' - ', {"value":"third item"}' - ']' % engine_settings.get('name') - ) - - -def search(query, request_params): - """Query (offline) engine and return results. Assemble the list of results from - your local engine. In this demo engine we ignore the 'query' term, usual - you would pass the 'query' term to your local engine to filter out the - results. 
- - """ - ret_val = [] - - result_list = json.loads(_my_offline_engine) - - for row in result_list: - entry = { - 'query': query, - 'language': request_params['searxng_locale'], - 'value': row.get("value"), - # choose a result template or comment out to use the *default* - 'template': 'key-value.html', - } - ret_val.append(entry) - - return ret_val diff --git a/apps/searxng/searx/engines/demo_online.py b/apps/searxng/searx/engines/demo_online.py deleted file mode 100755 index 08add53..0000000 --- a/apps/searxng/searx/engines/demo_online.py +++ /dev/null @@ -1,100 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Within this module we implement a *demo online engine*. Do not look to -close to the implementation, its just a simple example which queries `The Art -Institute of Chicago `_ - -To get in use of this *demo* engine add the following entry to your engines -list in ``settings.yml``: - -.. code:: yaml - - - name: my online engine - engine: demo_online - shortcut: demo - disabled: false - -""" - -from json import loads -from urllib.parse import urlencode - -engine_type = 'online' -send_accept_language_header = True -categories = ['general'] -disabled = True -timeout = 2.0 -categories = ['images'] -paging = True -page_size = 20 - -search_api = 'https://api.artic.edu/api/v1/artworks/search?' -image_api = 'https://www.artic.edu/iiif/2/' - -about = { - "website": 'https://www.artic.edu', - "wikidata_id": 'Q239303', - "official_api_documentation": 'http://api.artic.edu/docs/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - - -# if there is a need for globals, use a leading underline -_my_online_engine = None - - -def init(engine_settings): - """Initialization of the (online) engine. If no initialization is needed, drop - this init function. - - """ - global _my_online_engine # pylint: disable=global-statement - _my_online_engine = engine_settings.get('name') - - -def request(query, params): - """Build up the ``params`` for the online request. In this example we build a - URL to fetch images from `artic.edu `__ - - """ - args = urlencode( - { - 'q': query, - 'page': params['pageno'], - 'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles', - 'limit': page_size, - } - ) - params['url'] = search_api + args - return params - - -def response(resp): - """Parse out the result items from the response. In this example we parse the - response from `api.artic.edu `__ and filter out all - images. 
- - """ - results = [] - json_data = loads(resp.text) - - for result in json_data['data']: - - if not result['image_id']: - continue - - results.append( - { - 'url': 'https://artic.edu/artworks/%(id)s' % result, - 'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result, - 'content': result['medium_display'], - 'author': ', '.join(result['artist_titles']), - 'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result, - 'img_format': result['dimensions'], - 'template': 'images.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/deviantart.py b/apps/searxng/searx/engines/deviantart.py deleted file mode 100755 index e44ac28..0000000 --- a/apps/searxng/searx/engines/deviantart.py +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" - Deviantart (Images) -""" - -from urllib.parse import urlencode -from lxml import html - -# about -about = { - "website": 'https://www.deviantart.com/', - "wikidata_id": 'Q46523', - "official_api_documentation": 'https://www.deviantart.com/developers/', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['images'] -paging = True -time_range_support = True - -time_range_dict = { - 'day': 'popular-24-hours', - 'week': 'popular-1-week', - 'month': 'popular-1-month', - 'year': 'most-recent', -} - -# search-url -base_url = 'https://www.deviantart.com' - - -def request(query, params): - - # https://www.deviantart.com/search/deviations?page=5&q=foo - - query = { - 'page': params['pageno'], - 'q': query, - } - if params['time_range'] in time_range_dict: - query['order'] = time_range_dict[params['time_range']] - - params['url'] = base_url + '/search/deviations?' 
+ urlencode(query) - - return params - - -def response(resp): - - results = [] - - dom = html.fromstring(resp.text) - - for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): - for result in row.xpath('./div'): - - a_tag = result.xpath('.//a[@data-hook="deviation_link"]')[0] - noscript_tag = a_tag.xpath('.//noscript') - - if noscript_tag: - img_tag = noscript_tag[0].xpath('.//img') - else: - img_tag = a_tag.xpath('.//img') - if not img_tag: - continue - img_tag = img_tag[0] - - results.append( - { - 'template': 'images.html', - 'url': a_tag.attrib.get('href'), - 'img_src': img_tag.attrib.get('src'), - 'title': img_tag.attrib.get('alt'), - } - ) - - return results diff --git a/apps/searxng/searx/engines/dictzone.py b/apps/searxng/searx/engines/dictzone.py deleted file mode 100755 index 126e753..0000000 --- a/apps/searxng/searx/engines/dictzone.py +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Dictzone -""" - -from urllib.parse import urljoin -from lxml import html -from searx.utils import eval_xpath - -# about -about = { - "website": 'https://dictzone.com/', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -engine_type = 'online_dictionary' -categories = ['general'] -url = 'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' -weight = 100 - -results_xpath = './/table[@id="r"]/tr' -https_support = True - - -def request(query, params): - params['url'] = url.format(from_lang=params['from_lang'][2], to_lang=params['to_lang'][2], query=params['query']) - - return params - - -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]): - try: - from_result, to_results_raw = eval_xpath(result, './td') - except: - continue - - to_results = [] - for to_result in eval_xpath(to_results_raw, './p/a'): - t = to_result.text_content() - if t.strip(): - to_results.append(to_result.text_content()) - - results.append( - { - 'url': urljoin(str(resp.url), '?%d' % k), - 'title': from_result.text_content(), - 'content': '; '.join(to_results), - } - ) - - return results diff --git a/apps/searxng/searx/engines/digbt.py b/apps/searxng/searx/engines/digbt.py deleted file mode 100755 index 2914e92..0000000 --- a/apps/searxng/searx/engines/digbt.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - DigBT (Videos, Music, Files) -""" - -from urllib.parse import urljoin -from lxml import html -from searx.utils import extract_text, get_torrent_size - -# about -about = { - "website": 'https://digbt.org', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = ['videos', 'music', 'files'] -paging = True - -URL = 'https://digbt.org' -SEARCH_URL = URL + '/search/{query}-time-{pageno}' -FILESIZE = 3 -FILESIZE_MULTIPLIER = 4 - - -def request(query, params): - params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno']) - - return params - - -def response(resp): - dom = html.fromstring(resp.text) - search_res = dom.xpath('.//td[@class="x-item"]') - - if not search_res: - return list() - - results = list() - for result in search_res: - url = urljoin(URL, result.xpath('.//a[@title]/@href')[0]) - title = extract_text(result.xpath('.//a[@title]')) - content = extract_text(result.xpath('.//div[@class="files"]')) - files_data = 
extract_text(result.xpath('.//div[@class="tail"]')).split() - filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER]) - magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0] - - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'filesize': filesize, - 'magnetlink': magnetlink, - 'seed': 'N/A', - 'leech': 'N/A', - 'template': 'torrent.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/docker_hub.py b/apps/searxng/searx/engines/docker_hub.py deleted file mode 100755 index cde96d0..0000000 --- a/apps/searxng/searx/engines/docker_hub.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Docker Hub (IT) - -""" -# pylint: disable=use-dict-literal - -from json import loads -from urllib.parse import urlencode -from dateutil import parser - -about = { - "website": 'https://hub.docker.com', - "wikidata_id": 'Q100769064', - "official_api_documentation": 'https://docs.docker.com/registry/spec/api/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['it'] # optional -paging = True - -base_url = "https://hub.docker.com/" -search_url = base_url + "api/content/v1/products/search?{query}&type=image&page_size=25" - - -def request(query, params): - - params['url'] = search_url.format(query=urlencode(dict(q=query, page=params["pageno"]))) - params["headers"]["Search-Version"] = "v3" - - return params - - -def response(resp): - '''post-response callback - resp: requests response object - ''' - results = [] - body = loads(resp.text) - - # Make sure `summaries` isn't `null` - search_res = body.get("summaries") - if search_res: - for item in search_res: - result = {} - - # Make sure correct URL is set - filter_type = item.get("filter_type") - is_official = filter_type in ["store", "official"] - - if is_official: - result["url"] = base_url + "_/" + item.get('slug', "") - else: - result["url"] = base_url + "r/" + item.get('slug', "") - result["title"] = item.get("name") - result["content"] = item.get("short_description") - result["publishedDate"] = parser.parse(item.get("updated_at") or item.get("created_at")) - result["thumbnail"] = item["logo_url"].get("large") or item["logo_url"].get("small") - results.append(result) - - return results diff --git a/apps/searxng/searx/engines/doku.py b/apps/searxng/searx/engines/doku.py deleted file mode 100755 index 08f56bb..0000000 --- a/apps/searxng/searx/engines/doku.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Doku Wiki -""" - -from urllib.parse import urlencode -from lxml.html import fromstring -from searx.utils import extract_text, eval_xpath - -# about -about = { - "website": 'https://www.dokuwiki.org/', - "wikidata_id": 'Q851864', - "official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['general'] # TODO , 'images', 'music', 'videos', 'files' -paging = False -number_of_results = 5 - -# search-url -# Doku is OpenSearch compatible -base_url = 'http://localhost:8090' -search_url = ( - # fmt: off - '/?do=search' - '&{query}' - # fmt: on -) -# TODO '&startRecord={offset}' -# TODO '&maximumRecords={limit}' - - -# do search-request -def request(query, params): - - params['url'] = base_url + search_url.format(query=urlencode({'id': query})) - - return params - - -# get response from search-request -def 
response(resp): - results = [] - - doc = fromstring(resp.text) - - # parse results - # Quickhits - for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'): - try: - res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] - except: - continue - - if not res_url: - continue - - title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) - - # append result - results.append({'title': title, 'content': "", 'url': base_url + res_url}) - - # Search results - for r in eval_xpath(doc, '//dl[@class="search_results"]/*'): - try: - if r.tag == "dt": - res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] - title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) - elif r.tag == "dd": - content = extract_text(eval_xpath(r, '.')) - - # append result - results.append({'title': title, 'content': content, 'url': base_url + res_url}) - except: - continue - - if not res_url: - continue - - # return results - return results diff --git a/apps/searxng/searx/engines/duckduckgo.py b/apps/searxng/searx/engines/duckduckgo.py deleted file mode 100755 index 8349ad8..0000000 --- a/apps/searxng/searx/engines/duckduckgo.py +++ /dev/null @@ -1,437 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -DuckDuckGo Lite -~~~~~~~~~~~~~~~ -""" - -from typing import TYPE_CHECKING -import re -from urllib.parse import urlencode -import json -import babel -import lxml.html - -from searx import ( - locales, - redislib, - external_bang, -) -from searx.utils import ( - eval_xpath, - eval_xpath_getindex, - extract_text, -) -from searx.network import get # see https://github.com/searxng/searxng/issues/762 -from searx import redisdb -from searx.enginelib.traits import EngineTraits -from searx.exceptions import SearxEngineAPIException - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -about = { - "website": 'https://lite.duckduckgo.com/lite/', - "wikidata_id": 'Q12805', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -send_accept_language_header = True -"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP -``Accept-Language``. Optional the user can select a region filter (but not a -language). -""" - -# engine dependent config -categories = ['general', 'web'] -paging = True -time_range_support = True -safesearch = True # user can't select but the results are filtered - -url = 'https://lite.duckduckgo.com/lite/' -# url_ping = 'https://duckduckgo.com/t/sl_l' - -time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} -form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} - - -def cache_vqd(query, value): - """Caches a ``vqd`` value from a query. - - The vqd value depends on the query string and is needed for the follow up - pages or the images loaded by a XMLHttpRequest: - - - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...` - - DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...` - - """ - c = redisdb.client() - if c: - logger.debug("cache vqd value: %s", value) - key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) - c.set(key, value, ex=600) - - -def get_vqd(query, headers): - """Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached - (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the - response. 
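For orientation, a small sketch of where the token ends up (the token value is an invented placeholder; the ``i.js`` endpoint is the one named in :py:obj:`cache_vqd`):

.. code:: python

   from urllib.parse import urlencode

   vqd = '3-0000000000'  # placeholder; real values are scraped from the HTML as shown below
   image_url = 'https://duckduckgo.com/i.js?' + urlencode({'q': 'searxng', 'o': 'json', 'vqd': vqd})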
- - """ - value = None - c = redisdb.client() - if c: - key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) - value = c.get(key) - if value: - value = value.decode('utf-8') - logger.debug("re-use cached vqd value: %s", value) - return value - - query_url = 'https://duckduckgo.com/?q={query}&atb=v290-5'.format(query=urlencode({'q': query})) - res = get(query_url, headers=headers) - content = res.text # type: ignore - if content.find('vqd=\"') == -1: - raise SearxEngineAPIException('Request failed') - value = content[content.find('vqd=\"') + 5 :] - value = value[: value.find('\'')] - logger.debug("new vqd value: %s", value) - cache_vqd(query, value) - return value - - -def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): - """Get DuckDuckGo's language identifier from SearXNG's locale. - - DuckDuckGo defines its lanaguages by region codes (see - :py:obj:`fetch_traits`). - - To get region and language of a DDG service use: - - .. code: python - - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) - eng_lang = get_ddg_lang(traits, params['searxng_locale']) - - It might confuse, but the ``l`` value of the cookie is what SearXNG calls - the *region*: - - .. code:: python - - # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} - params['cookies']['ad'] = eng_lang - params['cookies']['ah'] = eng_region - params['cookies']['l'] = eng_region - - .. hint:: - - `DDG-lite `__ does not offer a language - selection to the user, only a region can be selected by the user - (``eng_region`` from the example above). DDG-lite stores the selected - region in a cookie:: - - params['cookies']['kl'] = eng_region # 'ar-es' - - """ - return eng_traits.custom['lang_region'].get( # type: ignore - sxng_locale, eng_traits.get_language(sxng_locale, default) - ) - - -ddg_reg_map = { - 'tw-tzh': 'zh_TW', - 'hk-tzh': 'zh_HK', - 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES - 'es-ca': 'ca_ES', - 'id-en': 'id_ID', - 'no-no': 'nb_NO', - 'jp-jp': 'ja_JP', - 'kr-kr': 'ko_KR', - 'xa-ar': 'ar_SA', - 'sl-sl': 'sl_SI', - 'th-en': 'th_TH', - 'vn-en': 'vi_VN', -} - -ddg_lang_map = { - # use ar --> ar_EG (Egypt's arabic) - "ar_DZ": 'lang_region', - "ar_JO": 'lang_region', - "ar_SA": 'lang_region', - # use bn --> bn_BD - 'bn_IN': 'lang_region', - # use de --> de_DE - 'de_CH': 'lang_region', - # use en --> en_US, - 'en_AU': 'lang_region', - 'en_CA': 'lang_region', - 'en_GB': 'lang_region', - # Esperanto - 'eo_XX': 'eo', - # use es --> es_ES, - 'es_AR': 'lang_region', - 'es_CL': 'lang_region', - 'es_CO': 'lang_region', - 'es_CR': 'lang_region', - 'es_EC': 'lang_region', - 'es_MX': 'lang_region', - 'es_PE': 'lang_region', - 'es_UY': 'lang_region', - 'es_VE': 'lang_region', - # use fr --> rf_FR - 'fr_CA': 'lang_region', - 'fr_CH': 'lang_region', - 'fr_BE': 'lang_region', - # use nl --> nl_NL - 'nl_BE': 'lang_region', - # use pt --> pt_PT - 'pt_BR': 'lang_region', - # skip these languages - 'od_IN': 'skip', - 'io_XX': 'skip', - 'tokipona_XX': 'skip', -} - - -def request(query, params): - - # quote ddg bangs - query_parts = [] - # for val in re.split(r'(\s+)', query): - for val in re.split(r'(\s+)', query): - if not val.strip(): - continue - if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]): - val = f"'{val}'" - query_parts.append(val) - query = ' '.join(query_parts) - - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) - # eng_lang = get_ddg_lang(traits, params['searxng_locale']) - - params['url'] = url - 
params['method'] = 'POST' - params['data']['q'] = query - - # The API is not documented, so we do some reverse engineering and emulate - # what https://lite.duckduckgo.com/lite/ does when you press "next Page" - # link again and again .. - - params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' - params['headers']['Referer'] = 'https://google.com/' - - # initial page does not have an offset - if params['pageno'] == 2: - # second page does have an offset of 30 - offset = (params['pageno'] - 1) * 30 - params['data']['s'] = offset - params['data']['dc'] = offset + 1 - - elif params['pageno'] > 2: - # third and following pages do have an offset of 30 + n*50 - offset = 30 + (params['pageno'] - 2) * 50 - params['data']['s'] = offset - params['data']['dc'] = offset + 1 - - # request needs a vqd argument - params['data']['vqd'] = get_vqd(query, params["headers"]) - - # initial page does not have additional data in the input form - if params['pageno'] > 1: - - params['data']['o'] = form_data.get('o', 'json') - params['data']['api'] = form_data.get('api', 'd.js') - params['data']['nextParams'] = form_data.get('nextParams', '') - params['data']['v'] = form_data.get('v', 'l') - - params['data']['kl'] = eng_region - params['cookies']['kl'] = eng_region - - params['data']['df'] = '' - if params['time_range'] in time_range_dict: - params['data']['df'] = time_range_dict[params['time_range']] - params['cookies']['df'] = time_range_dict[params['time_range']] - - logger.debug("param data: %s", params['data']) - logger.debug("param cookies: %s", params['cookies']) - return params - - -def response(resp): - - if resp.status_code == 303: - return [] - - results = [] - doc = lxml.html.fromstring(resp.text) - - result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') - - if len(result_table) == 2: - # some locales (at least China) does not have a "next page" button and - # the layout of the HTML tables is different. 
- result_table = result_table[1] - elif not len(result_table) >= 3: - # no more results - return [] - else: - result_table = result_table[2] - # update form data from response - form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') - if len(form): - - form = form[0] - form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0] - form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0] - form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0] - logger.debug('form_data: %s', form_data) - - value = eval_xpath(form, '//input[@name="vqd"]/@value')[0] - query = resp.search_params['data']['q'] - cache_vqd(query, value) - - tr_rows = eval_xpath(result_table, './/tr') - # In the last is the form of the 'previous/next page' links - tr_rows = tr_rows[:-1] - - len_tr_rows = len(tr_rows) - offset = 0 - - while len_tr_rows >= offset + 4: - - # assemble table rows we need to scrap - tr_title = tr_rows[offset] - tr_content = tr_rows[offset + 1] - offset += 4 - - # ignore sponsored Adds - if tr_content.get('class') == 'result-sponsored': - continue - - a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) - if a_tag is None: - continue - - td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) - if td_content is None: - continue - - results.append( - { - 'title': a_tag.text_content(), - 'content': extract_text(td_content), - 'url': a_tag.get('href'), - } - ) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages & regions from DuckDuckGo. - - SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``). - DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no - sense in a SearXNG request since SearXNG's ``all`` will not add a - ``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale`` - is ``wt-wt`` (the region). - - Beside regions DuckDuckGo also defines its lanaguages by region codes. By - example these are the english languages in DuckDuckGo: - - - en_US - - en_AU - - en_CA - - en_GB - - The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from - SearXNG's locale. - - """ - # pylint: disable=too-many-branches, too-many-statements - # fetch regions - - engine_traits.all_locale = 'wt-wt' - - # updated from u588 to u661 / should be updated automatically? 
- resp = get('https://duckduckgo.com/util/u661.js') - - if not resp.ok: # type: ignore - print("ERROR: response from DuckDuckGo is not OK.") - - pos = resp.text.find('regions:{') + 8 # type: ignore - js_code = resp.text[pos:] # type: ignore - pos = js_code.find('}') + 1 - regions = json.loads(js_code[:pos]) - - for eng_tag, name in regions.items(): - - if eng_tag == 'wt-wt': - engine_traits.all_locale = 'wt-wt' - continue - - region = ddg_reg_map.get(eng_tag) - if region == 'skip': - continue - - if not region: - eng_territory, eng_lang = eng_tag.split('-') - region = eng_lang + '_' + eng_territory.upper() - - try: - sxng_tag = locales.region_tag(babel.Locale.parse(region)) - except babel.UnknownLocaleError: - print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) - continue - - conflict = engine_traits.regions.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.regions[sxng_tag] = eng_tag - - # fetch languages - - engine_traits.custom['lang_region'] = {} - - pos = resp.text.find('languages:{') + 10 # type: ignore - js_code = resp.text[pos:] # type: ignore - pos = js_code.find('}') + 1 - js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') - languages = json.loads(js_code) - - for eng_lang, name in languages.items(): - - if eng_lang == 'wt_WT': - continue - - babel_tag = ddg_lang_map.get(eng_lang, eng_lang) - if babel_tag == 'skip': - continue - - try: - - if babel_tag == 'lang_region': - sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang)) - engine_traits.custom['lang_region'][sxng_tag] = eng_lang - continue - - sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag)) - - except babel.UnknownLocaleError: - print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang)) - continue - - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != eng_lang: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) - continue - engine_traits.languages[sxng_tag] = eng_lang diff --git a/apps/searxng/searx/engines/duckduckgo_definitions.py b/apps/searxng/searx/engines/duckduckgo_definitions.py deleted file mode 100755 index 39fed87..0000000 --- a/apps/searxng/searx/engines/duckduckgo_definitions.py +++ /dev/null @@ -1,255 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -DuckDuckGo Instant Answer API -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The `DDG-API `__ is no longer documented but from -reverse engineering we can see that some services (e.g. instant answers) still -in use from the DDG search engine. - -As far we can say the *instant answers* API does not support languages, or at -least we could not find out how language support should work. It seems that -most of the features are based on English terms. 
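A hedged sketch of the request this engine sends (the query string is invented; the parameter set matches the ``URL`` template defined below):

.. code:: python

   from urllib.parse import urlencode

   url = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'.format(
       query=urlencode({'q': 'linux kernel'})
   )
   # -> 'https://api.duckduckgo.com/?q=linux+kernel&format=json&pretty=0&no_redirect=1&d=1'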
- -""" - -from typing import TYPE_CHECKING - -from urllib.parse import urlencode, urlparse, urljoin -from lxml import html - -from searx.data import WIKIDATA_UNITS -from searx.utils import extract_text, html_to_text, get_string_replaces_function -from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -# about -about = { - "website": 'https://duckduckgo.com/', - "wikidata_id": 'Q12805', - "official_api_documentation": 'https://duckduckgo.com/api', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -send_accept_language_header = True - -URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' - -WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/'] - -replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) - - -def is_broken_text(text): - """duckduckgo may return something like ``
http://somewhere Related website`` - - The href URL is broken, the "Related website" may contains some HTML. - - The best solution seems to ignore these results. - """ - return text.startswith('http') and ' ' in text - - -def result_to_text(text, htmlResult): - # TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme - result = None - dom = html.fromstring(htmlResult) - a = dom.xpath('//a') - if len(a) >= 1: - result = extract_text(a[0]) - else: - result = text - if not is_broken_text(result): - return result - return None - - -def request(query, params): - params['url'] = URL.format(query=urlencode({'q': query})) - return params - - -def response(resp): - # pylint: disable=too-many-locals, too-many-branches, too-many-statements - results = [] - - search_res = resp.json() - - # search_res.get('Entity') possible values (not exhaustive) : - # * continent / country / department / location / waterfall - # * actor / musician / artist - # * book / performing art / film / television / media franchise / concert tour / playwright - # * prepared food - # * website / software / os / programming language / file format / software engineer - # * company - - content = '' - heading = search_res.get('Heading', '') - attributes = [] - urls = [] - infobox_id = None - relatedTopics = [] - - # add answer if there is one - answer = search_res.get('Answer', '') - if answer: - logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer) - if search_res.get('AnswerType') not in ['calc', 'ip']: - results.append({'answer': html_to_text(answer)}) - - # add infobox - if 'Definition' in search_res: - content = content + search_res.get('Definition', '') - - if 'Abstract' in search_res: - content = content + search_res.get('Abstract', '') - - # image - image = search_res.get('Image') - image = None if image == '' else image - if image is not None and urlparse(image).netloc == '': - image = urljoin('https://duckduckgo.com', image) - - # urls - # Official website, Wikipedia page - for ddg_result in search_res.get('Results', []): - firstURL = ddg_result.get('FirstURL') - text = ddg_result.get('Text') - if firstURL is not None and text is not None: - urls.append({'title': text, 'url': firstURL}) - results.append({'title': heading, 'url': firstURL}) - - # related topics - for ddg_result in search_res.get('RelatedTopics', []): - if 'FirstURL' in ddg_result: - firstURL = ddg_result.get('FirstURL') - text = ddg_result.get('Text') - if not is_broken_text(text): - suggestion = result_to_text(text, ddg_result.get('Result')) - if suggestion != heading and suggestion is not None: - results.append({'suggestion': suggestion}) - elif 'Topics' in ddg_result: - suggestions = [] - relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions}) - for topic_result in ddg_result.get('Topics', []): - suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result')) - if suggestion != heading and suggestion is not None: - suggestions.append(suggestion) - - # abstract - abstractURL = search_res.get('AbstractURL', '') - if abstractURL != '': - # add as result ? problem always in english - infobox_id = abstractURL - urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True}) - results.append({'url': abstractURL, 'title': heading}) - - # definition - definitionURL = search_res.get('DefinitionURL', '') - if definitionURL != '': - # add as result ? as answer ? 
problem always in english - infobox_id = definitionURL - urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL}) - - # to merge with wikidata's infobox - if infobox_id: - infobox_id = replace_http_by_https(infobox_id) - - # attributes - # some will be converted to urls - if 'Infobox' in search_res: - infobox = search_res.get('Infobox') - if 'content' in infobox: - osm_zoom = 17 - coordinates = None - for info in infobox.get('content'): - data_type = info.get('data_type') - data_label = info.get('label') - data_value = info.get('value') - - # Workaround: ddg may return a double quote - if data_value == '""': - continue - - # Is it an external URL ? - # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile - # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id - # * netflix_id - external_url = get_external_url(data_type, data_value) - if external_url is not None: - urls.append({'title': data_label, 'url': external_url}) - elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']: - # ignore instance: Wikidata value from "Instance Of" (Qxxxx) - # ignore wiki_maps_trigger: reference to a javascript - # ignore google_play_artist_id: service shutdown - pass - elif data_type == 'string' and data_label == 'Website': - # There is already an URL for the website - pass - elif data_type == 'area': - attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'}) - osm_zoom = area_to_osm_zoom(data_value.get('amount')) - elif data_type == 'coordinates': - if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2': - # coordinate on Earth - # get the zoom information from the area - coordinates = info - else: - # coordinate NOT on Earth - attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'}) - elif data_type == 'string': - attributes.append({'label': data_label, 'value': data_value}) - - if coordinates: - data_label = coordinates.get('label') - data_value = coordinates.get('value') - latitude = data_value.get('latitude') - longitude = data_value.get('longitude') - url = get_earth_coordinates_url(latitude, longitude, osm_zoom) - urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'}) - - if len(heading) > 0: - # TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme - if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0: - results.append({'url': urls[0]['url'], 'title': heading, 'content': content}) - else: - results.append( - { - 'infobox': heading, - 'id': infobox_id, - 'content': content, - 'img_src': image, - 'attributes': attributes, - 'urls': urls, - 'relatedTopics': relatedTopics, - } - ) - - return results - - -def unit_to_str(unit): - for prefix in WIKIDATA_PREFIX: - if unit.startswith(prefix): - wikidata_entity = unit[len(prefix) :] - return WIKIDATA_UNITS.get(wikidata_entity, unit) - return unit - - -def area_to_str(area): - """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``""" - unit = unit_to_str(area.get('unit')) - if unit is not None: - try: - amount = float(area.get('amount')) - return '{} {}'.format(amount, unit) - except ValueError: - pass - return '{} {}'.format(area.get('amount', ''), area.get('unit', '')) diff --git a/apps/searxng/searx/engines/duckduckgo_images.py b/apps/searxng/searx/engines/duckduckgo_images.py deleted file mode 100755 index d8a6f13..0000000 --- 
a/apps/searxng/searx/engines/duckduckgo_images.py +++ /dev/null @@ -1,100 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" -DuckDuckGo Images -~~~~~~~~~~~~~~~~~ -""" - -from typing import TYPE_CHECKING -from urllib.parse import urlencode - -from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import -from searx.engines.duckduckgo import ( - get_ddg_lang, - get_vqd, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://duckduckgo.com/', - "wikidata_id": 'Q12805', - "use_official_api": False, - "require_api_key": False, - "results": 'JSON (site requires js to get images)', -} - -# engine dependent config -categories = ['images', 'web'] -paging = True -safesearch = True -send_accept_language_header = True - -safesearch_cookies = {0: '-2', 1: None, 2: '1'} -safesearch_args = {0: '1', 1: None, 2: '1'} - - -def request(query, params): - - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) - eng_lang = get_ddg_lang(traits, params['searxng_locale']) - - args = { - 'q': query, - 'o': 'json', - # 'u': 'bing', - 'l': eng_region, - 'vqd': get_vqd(query, params["headers"]), - } - - if params['pageno'] > 1: - args['s'] = (params['pageno'] - 1) * 100 - - params['cookies']['ad'] = eng_lang # zh_CN - params['cookies']['ah'] = eng_region # "us-en,de-de" - params['cookies']['l'] = eng_region # "hk-tzh" - logger.debug("cookies: %s", params['cookies']) - - safe_search = safesearch_cookies.get(params['safesearch']) - if safe_search is not None: - params['cookies']['p'] = safe_search # "-2", "1" - safe_search = safesearch_args.get(params['safesearch']) - if safe_search is not None: - args['p'] = safe_search # "-1", "1" - - args = urlencode(args) - params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,') - - params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01' - params['headers']['Referer'] = 'https://duckduckgo.com/' - params['headers']['X-Requested-With'] = 'XMLHttpRequest' - logger.debug("headers: %s", params['headers']) - - return params - - -def response(resp): - results = [] - res_json = resp.json() - - for result in res_json['results']: - results.append( - { - 'template': 'images.html', - 'title': result['title'], - 'content': '', - 'thumbnail_src': result['thumbnail'], - 'img_src': result['image'], - 'url': result['url'], - 'img_format': '%s x %s' % (result['width'], result['height']), - 'source': result['source'], - } - ) - - return results diff --git a/apps/searxng/searx/engines/duckduckgo_weather.py b/apps/searxng/searx/engines/duckduckgo_weather.py deleted file mode 100755 index f239ce8..0000000 --- a/apps/searxng/searx/engines/duckduckgo_weather.py +++ /dev/null @@ -1,163 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -DuckDuckGo Weather -~~~~~~~~~~~~~~~~~~ -""" - -from typing import TYPE_CHECKING -from json import loads -from urllib.parse import quote - -from datetime import datetime -from flask_babel import gettext - -from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import -from searx.engines.duckduckgo import get_ddg_lang -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - - -about = { - "website": 'https://duckduckgo.com/', - "wikidata_id": 'Q12805', - "official_api_documentation": None, - "use_official_api": True, - 
"require_api_key": False, - "results": "JSON", -} - -send_accept_language_header = True - -# engine dependent config -categories = ["weather"] -URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" - - -def generate_condition_table(condition): - res = "" - - res += f"{gettext('Condition')}" f"{condition['summary']}" - - res += ( - f"{gettext('Temperature')}" - f"{f_to_c(condition['temperature'])}°C / {condition['temperature']}°F" - ) - - res += ( - f"{gettext('Feels like')}{f_to_c(condition['apparentTemperature'])}°C / " - f"{condition['apparentTemperature']}°F" - ) - - res += ( - f"{gettext('Wind')}{condition['windBearing']}° — " - f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph" - ) - - res += f"{gettext('Visibility')}{condition['visibility']} km" - - res += f"{gettext('Humidity')}{(condition['humidity'] * 100):.1f}%" - - return res - - -def generate_day_table(day): - res = "" - - res += ( - f"{gettext('Min temp.')}{f_to_c(day['temperatureLow'])}°C / " - f"{day['temperatureLow']}°F" - ) - res += ( - f"{gettext('Max temp.')}{f_to_c(day['temperatureHigh'])}°C / " - f"{day['temperatureHigh']}°F" - ) - res += f"{gettext('UV index')}{day['uvIndex']}" - res += ( - f"{gettext('Sunrise')}{datetime.fromtimestamp(day['sunriseTime']).strftime('%H:%M')}" - ) - res += ( - f"{gettext('Sunset')}{datetime.fromtimestamp(day['sunsetTime']).strftime('%H:%M')}" - ) - - return res - - -def request(query, params): - - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) - eng_lang = get_ddg_lang(traits, params['searxng_locale']) - - # !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} - params['cookies']['ad'] = eng_lang - params['cookies']['ah'] = eng_region - params['cookies']['l'] = eng_region - logger.debug("cookies: %s", params['cookies']) - - params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0]) - return params - - -def f_to_c(temperature): - return "%.2f" % ((temperature - 32) / 1.8) - - -def response(resp): - results = [] - - if resp.text.strip() == "ddg_spice_forecast();": - return [] - - result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]) - - current = result["currently"] - - title = result['flags']['ddg-location'] - - infobox = f"

{gettext('Current condition')}

" - - infobox += generate_condition_table(current) - - infobox += "
" - - last_date = None - - for time in result['hourly']['data']: - current_time = datetime.fromtimestamp(time['time']) - - if last_date != current_time.date(): - if last_date is not None: - infobox += "" - - infobox += f"

{current_time.strftime('%Y-%m-%d')}

" - - infobox += "" - - for day in result['daily']['data']: - if datetime.fromtimestamp(day['time']).date() == current_time.date(): - infobox += generate_day_table(day) - - infobox += "
" - - last_date = current_time.date() - - infobox += f"" - - infobox += generate_condition_table(time) - - infobox += "
{current_time.strftime('%H:%M')}
" - - results.append( - { - "infobox": title, - "content": infobox, - } - ) - - return results diff --git a/apps/searxng/searx/engines/duden.py b/apps/searxng/searx/engines/duden.py deleted file mode 100755 index dca5664..0000000 --- a/apps/searxng/searx/engines/duden.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Duden -""" - -import re -from urllib.parse import quote, urljoin -from lxml import html -from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex -from searx.network import raise_for_httperror - -# about -about = { - "website": 'https://www.duden.de', - "wikidata_id": 'Q73624591', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', - "language": 'de', -} - -categories = ['dictionaries'] -paging = True - -# search-url -base_url = 'https://www.duden.de/' -search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}' - - -def request(query, params): - '''pre-request callback - params: - method : POST/GET - headers : {} - data : {} # if method == POST - url : '' - category: 'search category' - pageno : 1 # number of the requested page - ''' - - offset = params['pageno'] - 1 - if offset == 0: - search_url_fmt = base_url + 'suchen/dudenonline/{query}' - params['url'] = search_url_fmt.format(query=quote(query)) - else: - params['url'] = search_url.format(offset=offset, query=quote(query)) - # after the last page of results, spelling corrections are returned after a HTTP redirect - # whatever the page number is - params['soft_max_redirects'] = 1 - params['raise_for_httperror'] = False - return params - - -def response(resp): - '''post-response callback - resp: requests response object - ''' - results = [] - - if resp.status_code == 404: - return results - - raise_for_httperror(resp) - - dom = html.fromstring(resp.text) - - number_of_results_element = eval_xpath_getindex( - dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None - ) - if number_of_results_element is not None: - number_of_results_string = re.sub('[^0-9]', '', number_of_results_element) - results.append({'number_of_results': int(number_of_results_string)}) - - for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'): - url = eval_xpath_getindex(result, './/h2/a', 0).get('href') - url = urljoin(base_url, url) - title = eval_xpath(result, 'string(.//h2/a)').strip() - content = extract_text(eval_xpath(result, './/p')) - # append result - results.append({'url': url, 'title': title, 'content': content}) - - return results diff --git a/apps/searxng/searx/engines/dummy-offline.py b/apps/searxng/searx/engines/dummy-offline.py deleted file mode 100755 index 632eeb2..0000000 --- a/apps/searxng/searx/engines/dummy-offline.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Dummy Offline -""" - - -# about -about = { - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - - -def search(query, request_params): - return [ - { - 'result': 'this is what you get', - } - ] diff --git a/apps/searxng/searx/engines/dummy.py b/apps/searxng/searx/engines/dummy.py deleted file mode 100755 index 1a1b57d..0000000 --- a/apps/searxng/searx/engines/dummy.py +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Dummy -""" - -# about -about = { - "website": None, - "wikidata_id": 
None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'empty array', -} - - -# do search-request -def request(query, params): - return params - - -# get response from search-request -def response(resp): - return [] diff --git a/apps/searxng/searx/engines/ebay.py b/apps/searxng/searx/engines/ebay.py deleted file mode 100755 index 07870f0..0000000 --- a/apps/searxng/searx/engines/ebay.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Ebay (Videos, Music, Files) -""" - -from lxml import html -from searx.engines.xpath import extract_text -from urllib.parse import quote - -# about -about = { - "website": 'https://www.ebay.com', - "wikidata_id": 'Q58024', - "official_api_documentation": 'https://developer.ebay.com/', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = ['shopping'] -paging = True - -# Set base_url in settings.yml in order to -# have the desired local TLD. -base_url = None -search_url = '/sch/i.html?_nkw={query}&_sacat={pageno}' - -results_xpath = '//li[contains(@class, "s-item")]' -url_xpath = './/a[@class="s-item__link"]/@href' -title_xpath = './/h3[@class="s-item__title"]' -content_xpath = './/div[@span="SECONDARY_INFO"]' -price_xpath = './/div[contains(@class, "s-item__detail")]/span[@class="s-item__price"][1]/text()' -shipping_xpath = './/span[contains(@class, "s-item__shipping")]/text()' -source_country_xpath = './/span[contains(@class, "s-item__location")]/text()' -thumbnail_xpath = './/img[@class="s-item__image-img"]/@src' - - -def request(query, params): - params['url'] = f'{base_url}' + search_url.format(query=quote(query), pageno=params['pageno']) - return params - - -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - results_dom = dom.xpath(results_xpath) - if not results_dom: - return [] - - for result_dom in results_dom: - url = extract_text(result_dom.xpath(url_xpath)) - title = extract_text(result_dom.xpath(title_xpath)) - content = extract_text(result_dom.xpath(content_xpath)) - price = extract_text(result_dom.xpath(price_xpath)) - shipping = extract_text(result_dom.xpath(shipping_xpath)) - source_country = extract_text(result_dom.xpath(source_country_xpath)) - thumbnail = extract_text(result_dom.xpath(thumbnail_xpath)) - - if title == "": - continue - - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'price': price, - 'shipping': shipping, - 'source_country': source_country, - 'thumbnail': thumbnail, - 'template': 'products.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/elasticsearch.py b/apps/searxng/searx/engines/elasticsearch.py deleted file mode 100755 index 7bddab1..0000000 --- a/apps/searxng/searx/engines/elasticsearch.py +++ /dev/null @@ -1,178 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""".. sidebar:: info - - - :origin:`elasticsearch.py ` - - `Elasticsearch `_ - - `Elasticsearch Guide - `_ - - `Install Elasticsearch - `_ - -Elasticsearch_ supports numerous ways to query the data it is storing. At the -moment the engine supports the most popular search methods (``query_type``): - -- ``match``, -- ``simple_query_string``, -- ``term`` and -- ``terms``. - -If none of the methods fit your use case, you can select ``custom`` query type -and provide the JSON payload to submit to Elasticsearch in -``custom_query_json``. 
- -Example -======= - -The following is an example configuration for an Elasticsearch_ instance with -authentication configured to read from ``my-index`` index. - -.. code:: yaml - - - name: elasticsearch - shortcut: es - engine: elasticsearch - base_url: http://localhost:9200 - username: elastic - password: changeme - index: my-index - query_type: match - # custom_query_json: '{ ... }' - enable_http: true - -""" - -from json import loads, dumps -from searx.exceptions import SearxEngineAPIException - - -base_url = 'http://localhost:9200' -username = '' -password = '' -index = '' -search_url = base_url + '/' + index + '/_search' -query_type = 'match' -custom_query_json = {} -show_metadata = False -categories = ['general'] - - -def init(engine_settings): - if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types: - raise ValueError('unsupported query type', engine_settings['query_type']) - - if index == '': - raise ValueError('index cannot be empty') - - -def request(query, params): - if query_type not in _available_query_types: - return params - - if username and password: - params['auth'] = (username, password) - - params['url'] = search_url - params['method'] = 'GET' - params['data'] = dumps(_available_query_types[query_type](query)) - params['headers']['Content-Type'] = 'application/json' - - return params - - -def _match_query(query): - """ - The standard for full text queries. - searx format: "key:value" e.g. city:berlin - REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html - """ - - try: - key, value = query.split(':') - except Exception as e: - raise ValueError('query format must be "key:value"') from e - - return {"query": {"match": {key: {'query': value}}}} - - -def _simple_query_string_query(query): - """ - Accepts query strings, but it is less strict than query_string - The field used can be specified in index.query.default_field in Elasticsearch. - REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html - """ - - return {'query': {'simple_query_string': {'query': query}}} - - -def _term_query(query): - """ - Accepts one term and the name of the field. - searx format: "key:value" e.g. city:berlin - REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html - """ - - try: - key, value = query.split(':') - except Exception as e: - raise ValueError('query format must be key:value') from e - - return {'query': {'term': {key: value}}} - - -def _terms_query(query): - """ - Accepts multiple terms and the name of the field. - searx format: "key:value1,value2" e.g. 
city:berlin,paris - REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html - """ - - try: - key, values = query.split(':') - except Exception as e: - raise ValueError('query format must be key:value1,value2') from e - - return {'query': {'terms': {key: values.split(',')}}} - - -def _custom_query(query): - key, value = query.split(':') - custom_query = custom_query_json - for query_key, query_value in custom_query.items(): - if query_key == '{{KEY}}': - custom_query[key] = custom_query.pop(query_key) - if query_value == '{{VALUE}}': - custom_query[query_key] = value - return custom_query - - -def response(resp): - results = [] - - resp_json = loads(resp.text) - if 'error' in resp_json: - raise SearxEngineAPIException(resp_json['error']) - - for result in resp_json['hits']['hits']: - r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()} - r['template'] = 'key-value.html' - - if show_metadata: - r['metadata'] = {'index': result['_index'], 'id': result['_id'], 'score': result['_score']} - - results.append(r) - - return results - - -_available_query_types = { - # Full text queries - # https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html - 'match': _match_query, - 'simple_query_string': _simple_query_string_query, - # Term-level queries - # https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html - 'term': _term_query, - 'terms': _terms_query, - # Query JSON defined by the instance administrator. - 'custom': _custom_query, -} diff --git a/apps/searxng/searx/engines/emojipedia.py b/apps/searxng/searx/engines/emojipedia.py deleted file mode 100755 index 020bf68..0000000 --- a/apps/searxng/searx/engines/emojipedia.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Emojipedia - -Emojipedia is an emoji reference website which documents the meaning and -common usage of emoji characters in the Unicode Standard. It is owned by Zedge -since 2021. Emojipedia is a voting member of The Unicode Consortium.[1] - -[1] https://en.wikipedia.org/wiki/Emojipedia -""" - -from urllib.parse import urlencode -from lxml import html - -from searx.utils import ( - eval_xpath_list, - eval_xpath_getindex, - extract_text, -) - -about = { - "website": 'https://emojipedia.org', - "wikidata_id": 'Q22908129', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = [] -paging = False -time_range_support = False - -base_url = 'https://emojipedia.org' -search_url = base_url + '/search/?{query}' - - -def request(query, params): - params['url'] = search_url.format( - query=urlencode({'q': query}), - ) - return params - - -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"): - - extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0)) - - if 'No results found.' 
in extracted_desc: - break - - link = eval_xpath_getindex(result, './/h2/a', 0) - - url = base_url + link.attrib.get('href') - title = extract_text(link) - content = extracted_desc - - res = {'url': url, 'title': title, 'content': content} - - results.append(res) - - return results diff --git a/apps/searxng/searx/engines/fdroid.py b/apps/searxng/searx/engines/fdroid.py deleted file mode 100755 index b5f004e..0000000 --- a/apps/searxng/searx/engines/fdroid.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - F-Droid (a repository of FOSS applications for Android) -""" - -from urllib.parse import urlencode -from lxml import html -from searx.utils import extract_text - -# about -about = { - "website": 'https://f-droid.org/', - "wikidata_id": 'Q1386210', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files', 'apps'] -paging = True - -# search-url -base_url = 'https://search.f-droid.org/' -search_url = base_url + '?{query}' - - -# do search-request -def request(query, params): - query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''}) - params['url'] = search_url.format(query=query) - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - for app in dom.xpath('//a[@class="package-header"]'): - app_url = app.xpath('./@href')[0] - app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()')) - app_content = ( - extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() - + ' - ' - + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip() - ) - app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0] - - results.append({'url': app_url, 'title': app_title, 'content': app_content, 'img_src': app_img_src}) - - return results diff --git a/apps/searxng/searx/engines/flickr.py b/apps/searxng/searx/engines/flickr.py deleted file mode 100755 index b7cd768..0000000 --- a/apps/searxng/searx/engines/flickr.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Flickr (Images) - - More info on api-key : https://www.flickr.com/services/apps/create/ -""" - -from json import loads -from urllib.parse import urlencode - -# about -about = { - "website": 'https://www.flickr.com', - "wikidata_id": 'Q103204', - "official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html', - "use_official_api": True, - "require_api_key": True, - "results": 'JSON', -} - -categories = ['images'] - -nb_per_page = 15 -paging = True -api_key = None - - -url = ( - 'https://api.flickr.com/services/rest/?method=flickr.photos.search' - + '&api_key={api_key}&{text}&sort=relevance' - + '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z' - + '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}' -) -photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' - -paging = True - - -def build_flickr_url(user_id, photo_id): - return photo_url.format(userid=user_id, photoid=photo_id) - - -def request(query, params): - params['url'] = url.format( - text=urlencode({'text': query}), api_key=api_key, nb_per_page=nb_per_page, page=params['pageno'] - ) - return params - - -def response(resp): - results = [] - - search_results = loads(resp.text) - - # return empty array if there are no results - if 'photos' not in search_results: - return [] - - if 'photo' not in 
search_results['photos']: - return [] - - photos = search_results['photos']['photo'] - - # parse results - for photo in photos: - if 'url_o' in photo: - img_src = photo['url_o'] - elif 'url_z' in photo: - img_src = photo['url_z'] - else: - continue - - # For a bigger thumbnail, keep only the url_z, not the url_n - if 'url_n' in photo: - thumbnail_src = photo['url_n'] - elif 'url_z' in photo: - thumbnail_src = photo['url_z'] - else: - thumbnail_src = img_src - - url = build_flickr_url(photo['owner'], photo['id']) - - # append result - results.append( - { - 'url': url, - 'title': photo['title'], - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'content': photo['description']['_content'], - 'author': photo['ownername'], - 'template': 'images.html', - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/flickr_noapi.py b/apps/searxng/searx/engines/flickr_noapi.py deleted file mode 100755 index 5299c60..0000000 --- a/apps/searxng/searx/engines/flickr_noapi.py +++ /dev/null @@ -1,143 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Flickr (Images) - -""" - -from typing import TYPE_CHECKING - -import json -from time import time -import re -from urllib.parse import urlencode -from searx.utils import ecma_unescape, html_to_text - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -# about -about = { - "website": 'https://www.flickr.com', - "wikidata_id": 'Q103204', - "official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['images'] -paging = True -time_range_support = True -safesearch = False - -time_range_dict = { - 'day': 60 * 60 * 24, - 'week': 60 * 60 * 24 * 7, - 'month': 60 * 60 * 24 * 7 * 4, - 'year': 60 * 60 * 24 * 7 * 52, -} -image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's') - -search_url = 'https://www.flickr.com/search?{query}&page={page}' -time_range_url = '&min_upload_date={start}&max_upload_date={end}' -photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' -modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M) - - -def build_flickr_url(user_id, photo_id): - return photo_url.format(userid=user_id, photoid=photo_id) - - -def _get_time_range_url(time_range): - if time_range in time_range_dict: - return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range])) - return '' - - -def request(query, params): - params['url'] = search_url.format(query=urlencode({'text': query}), page=params['pageno']) + _get_time_range_url( - params['time_range'] - ) - return params - - -def response(resp): # pylint: disable=too-many-branches - results = [] - - matches = modelexport_re.search(resp.text) - if matches is None: - return results - - match = matches.group(1) - model_export = json.loads(match) - - if 'legend' not in model_export: - return results - legend = model_export['legend'] - - # handle empty page - if not legend or not legend[0]: - return results - - for x, index in enumerate(legend): - if len(index) != 8: - logger.debug("skip legend enty %s : %s", x, index) - continue - - photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][ - index[7] - ] - author = ecma_unescape(photo.get('realname', '')) - source = ecma_unescape(photo.get('username', '')) - if source: - source += ' @ Flickr' - title = ecma_unescape(photo.get('title', '')) - 
content = html_to_text(ecma_unescape(photo.get('description', ''))) - img_src = None - - # From the biggest to the lowest format - size_data = None - for image_size in image_sizes: - if image_size in photo['sizes']['data']: - size_data = photo['sizes']['data'][image_size]['data'] - break - - if not size_data: - logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data']))) - continue - - img_src = size_data['url'] - img_format = f"{size_data['width']} x {size_data['height']}" - - # For a bigger thumbnail, keep only the url_z, not the url_n - if 'n' in photo['sizes']['data']: - thumbnail_src = photo['sizes']['data']['n']['data']['url'] - elif 'z' in photo['sizes']['data']: - thumbnail_src = photo['sizes']['data']['z']['data']['url'] - else: - thumbnail_src = img_src - - if 'ownerNsid' not in photo: - # should not happen, disowned photo? Show it anyway - url = img_src - else: - url = build_flickr_url(photo['ownerNsid'], photo['id']) - - result = { - 'url': url, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'source': source, - 'img_format': img_format, - 'template': 'images.html', - } - result['author'] = author.encode(errors='ignore').decode() - result['source'] = source.encode(errors='ignore').decode() - result['title'] = title.encode(errors='ignore').decode() - result['content'] = content.encode(errors='ignore').decode() - results.append(result) - - return results diff --git a/apps/searxng/searx/engines/framalibre.py b/apps/searxng/searx/engines/framalibre.py deleted file mode 100755 index b2c9d90..0000000 --- a/apps/searxng/searx/engines/framalibre.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - FramaLibre (It) -""" - -from html import escape -from urllib.parse import urljoin, urlencode -from lxml import html -from searx.utils import extract_text - -# about -about = { - "website": 'https://framalibre.org/', - "wikidata_id": 'Q30213882', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['it'] -paging = True - -# search-url -base_url = 'https://framalibre.org/' -search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}' - -# specific xpath variables -results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]' -link_xpath = './/h3[@class="node-title"]/a[@href]' -thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src' -content_xpath = './/div[@class="content"]//p' - - -# do search-request -def request(query, params): - offset = params['pageno'] - 1 - params['url'] = search_url.format(query=urlencode({'keys': query}), offset=offset) - - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - # parse results - for result in dom.xpath(results_xpath): - link = result.xpath(link_xpath)[0] - href = urljoin(base_url, link.attrib.get('href')) - # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this... 
- title = escape(extract_text(link)) - thumbnail_tags = result.xpath(thumbnail_xpath) - thumbnail = None - if len(thumbnail_tags) > 0: - thumbnail = extract_text(thumbnail_tags[0]) - if thumbnail[0] == '/': - thumbnail = base_url + thumbnail - content = escape(extract_text(result.xpath(content_xpath))) - - # append result - results.append({'url': href, 'title': title, 'img_src': thumbnail, 'content': content}) - - # return results - return results diff --git a/apps/searxng/searx/engines/freesound.py b/apps/searxng/searx/engines/freesound.py deleted file mode 100755 index ea66666..0000000 --- a/apps/searxng/searx/engines/freesound.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" -Freesound (Sound) -""" - -from json import loads -from urllib.parse import urlencode -from datetime import datetime - -disabled = True -api_key = "" - -# about -about = { - "website": "https://freesound.org", - "wikidata_id": "Q835703", - "official_api_documentation": "https://freesound.org/docs/api", - "use_official_api": True, - "require_api_key": True, - "results": "JSON", -} - -# engine dependent config -paging = True - -# search url -url = "https://freesound.org/apiv2/" -search_url = ( - url + "search/text/?query={query}&page={page}&fields=name,url,download,created,description,type&token={api_key}" -) - -# search request -def request(query, params): - params["url"] = search_url.format( - query=urlencode({"q": query}), - page=params["pageno"], - api_key=api_key, - ) - return params - - -# get response from search request -def response(resp): - results = [] - search_res = loads(resp.text) - # parse results - for result in search_res.get("results", []): - title = result["name"] - content = result["description"][:128] - publishedDate = datetime.fromisoformat(result["created"]) - uri = result["download"] - - # append result - results.append( - { - "url": result["url"], - "title": title, - "publishedDate": publishedDate, - "audio_src": uri, - "content": content, - } - ) - - return results diff --git a/apps/searxng/searx/engines/frinkiac.py b/apps/searxng/searx/engines/frinkiac.py deleted file mode 100755 index 95a1366..0000000 --- a/apps/searxng/searx/engines/frinkiac.py +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Frinkiac (Images) -""" - -from json import loads -from urllib.parse import urlencode - -# about -about = { - "website": 'https://frinkiac.com', - "wikidata_id": 'Q24882614', - "official_api_documentation": {'url': None, 'comment': 'see https://github.com/MitchellAW/CompuGlobal'}, - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['images'] - -BASE = 'https://frinkiac.com/' -SEARCH_URL = '{base}api/search?{query}' -RESULT_URL = '{base}?{query}' -THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg' -IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg' - - -def request(query, params): - params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query})) - return params - - -def response(resp): - results = [] - response_data = loads(resp.text) - for result in response_data: - episode = result['Episode'] - timestamp = result['Timestamp'] - - results.append( - { - 'template': 'images.html', - 'url': RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})), - 'title': episode, - 'content': '', - 'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp), - 'img_src': IMAGE_URL.format(base=BASE, episode=episode, 
timestamp=timestamp), - } - ) - - return results diff --git a/apps/searxng/searx/engines/genius.py b/apps/searxng/searx/engines/genius.py deleted file mode 100755 index db1f666..0000000 --- a/apps/searxng/searx/engines/genius.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=invalid-name -"""Genius - -""" - -from urllib.parse import urlencode -from datetime import datetime - -# about -about = { - "website": 'https://genius.com/', - "wikidata_id": 'Q3419343', - "official_api_documentation": 'https://docs.genius.com/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['music', 'lyrics'] -paging = True -page_size = 5 - -url = 'https://genius.com/api/' -search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}' -music_player = 'https://genius.com{api_path}/apple_music_player' - - -def request(query, params): - params['url'] = search_url.format( - query=urlencode({'q': query}), - index='multi', - page_size=page_size, - pageno=params['pageno'], - ) - return params - - -def parse_lyric(hit): - content = '' - highlights = hit['highlights'] - if highlights: - content = hit['highlights'][0]['value'] - else: - content = hit['result'].get('title_with_featured', '') - - timestamp = hit['result']['lyrics_updated_at'] - result = { - 'url': hit['result']['url'], - 'title': hit['result']['full_title'], - 'content': content, - 'img_src': hit['result']['song_art_image_thumbnail_url'], - } - if timestamp: - result.update({'publishedDate': datetime.fromtimestamp(timestamp)}) - api_path = hit['result'].get('api_path') - if api_path: - # The players are just playing 30sec from the title. Some of the player - # will be blocked because of a cross-origin request and some players will - # link to apple when you press the play button. 
- result['iframe_src'] = music_player.format(api_path=api_path) - return result - - -def parse_artist(hit): - result = { - 'url': hit['result']['url'], - 'title': hit['result']['name'], - 'content': '', - 'img_src': hit['result']['image_url'], - } - return result - - -def parse_album(hit): - res = hit['result'] - content = res.get('name_with_artist', res.get('name', '')) - x = res.get('release_date_components') - if x: - x = x.get('year') - if x: - content = "%s / %s" % (x, content) - return { - 'url': res['url'], - 'title': res['full_title'], - 'img_src': res['cover_art_url'], - 'content': content.strip(), - } - - -parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album} - - -def response(resp): - results = [] - for section in resp.json()['response']['sections']: - for hit in section['hits']: - func = parse.get(hit['type']) - if func: - results.append(func(hit)) - return results diff --git a/apps/searxng/searx/engines/gentoo.py b/apps/searxng/searx/engines/gentoo.py deleted file mode 100755 index f0cb6a7..0000000 --- a/apps/searxng/searx/engines/gentoo.py +++ /dev/null @@ -1,124 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Gentoo Wiki -""" - -from urllib.parse import urlencode, urljoin -from lxml import html -from searx.utils import extract_text - -# about -about = { - "website": 'https://wiki.gentoo.org/', - "wikidata_id": 'Q1050637', - "official_api_documentation": 'https://wiki.gentoo.org/api.php', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['it', 'software wikis'] -paging = True -base_url = 'https://wiki.gentoo.org' - -# xpath queries -xpath_results = '//ul[@class="mw-search-results"]/li' -xpath_link = './/div[@class="mw-search-result-heading"]/a' -xpath_content = './/div[@class="searchresult"]' - - -# cut 'en' from 'en-US', 'de' from 'de-CH', and so on -def locale_to_lang_code(locale): - if locale.find('-') >= 0: - locale = locale.split('-')[0] - return locale - - -# wikis for some languages were moved off from the main site, we need to make -# requests to correct URLs to be able to get results in those languages -lang_urls = { - 'en': {'base': 'https://wiki.gentoo.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}'}, - 'others': { - 'base': 'https://wiki.gentoo.org', - 'search': '/index.php?title=Special:Search&offset={offset}&{query}\ - &profile=translation&languagefilter={language}', - }, -} - - -# get base & search URLs for selected language -def get_lang_urls(language): - if language != 'en': - return lang_urls['others'] - return lang_urls['en'] - - -# Language names to build search requests for -# those languages which are hosted on the main site. -main_langs = { - 'ar': 'العربية', - 'bg': 'Български', - 'cs': 'Česky', - 'da': 'Dansk', - 'el': 'Ελληνικά', - 'es': 'Español', - 'he': 'עברית', - 'hr': 'Hrvatski', - 'hu': 'Magyar', - 'it': 'Italiano', - 'ko': '한국어', - 'lt': 'Lietuviškai', - 'nl': 'Nederlands', - 'pl': 'Polski', - 'pt': 'Português', - 'ru': 'Русский', - 'sl': 'Slovenský', - 'th': 'ไทย', - 'uk': 'Українська', - 'zh': '简体中文', -} - -# do search-request -def request(query, params): - # translate the locale (e.g. 
'en-US') to language code ('en') - language = locale_to_lang_code(params['language']) - - # if our language is hosted on the main site, we need to add its name - # to the query in order to narrow the results to that language - if language in main_langs: - query += ' (' + main_langs[language] + ')' - - # prepare the request parameters - query = urlencode({'search': query}) - offset = (params['pageno'] - 1) * 20 - - # get request URLs for our language of choice - urls = get_lang_urls(language) - search_url = urls['base'] + urls['search'] - - params['url'] = search_url.format(query=query, offset=offset, language=language) - - return params - - -# get response from search-request -def response(resp): - # get the base URL for the language in which request was made - language = locale_to_lang_code(resp.search_params['language']) - base_url = get_lang_urls(language)['base'] - - results = [] - - dom = html.fromstring(resp.text) - - # parse results - for result in dom.xpath(xpath_results): - link = result.xpath(xpath_link)[0] - href = urljoin(base_url, link.attrib.get('href')) - title = extract_text(link) - content = extract_text(result.xpath(xpath_content)) - - results.append({'url': href, 'title': title, 'content': content}) - - return results diff --git a/apps/searxng/searx/engines/github.py b/apps/searxng/searx/engines/github.py deleted file mode 100755 index 3180418..0000000 --- a/apps/searxng/searx/engines/github.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Github (IT) -""" - -from json import loads -from urllib.parse import urlencode - -# about -about = { - "website": 'https://github.com/', - "wikidata_id": 'Q364', - "official_api_documentation": 'https://developer.github.com/v3/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['it', 'repos'] - -# search-url -search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}' # noqa - -accept_header = 'application/vnd.github.preview.text-match+json' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query})) - - params['headers']['Accept'] = accept_header - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_res = loads(resp.text) - - # check if items are received - if 'items' not in search_res: - return [] - - # parse results - for res in search_res['items']: - title = res['name'] - url = res['html_url'] - - if res['description']: - content = res['description'][:500] - else: - content = '' - - # append result - results.append({'url': url, 'title': title, 'content': content}) - - # return results - return results diff --git a/apps/searxng/searx/engines/google.py b/apps/searxng/searx/engines/google.py deleted file mode 100755 index d06c055..0000000 --- a/apps/searxng/searx/engines/google.py +++ /dev/null @@ -1,493 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This is the implementation of the Google WEB engine. 
Some of this -implementations (manly the :py:obj:`get_google_info`) are shared by other -engines: - -- :ref:`google images engine` -- :ref:`google news engine` -- :ref:`google videos engine` -- :ref:`google scholar engine` -- :ref:`google autocomplete` - -""" - -from typing import TYPE_CHECKING - -import re -from urllib.parse import urlencode -from lxml import html -import babel -import babel.core -import babel.languages - -from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex -from searx.locales import language_tag, region_tag, get_offical_locales -from searx.network import get # see https://github.com/searxng/searxng/issues/762 -from searx.exceptions import SearxEngineCaptchaException -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - - -# about -about = { - "website": 'https://www.google.com', - "wikidata_id": 'Q9366', - "official_api_documentation": 'https://developers.google.com/custom-search/', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['general', 'web'] -paging = True -time_range_support = True -safesearch = True - -time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} - -# Filter results. 0: None, 1: Moderate, 2: Strict -filter_mapping = {0: 'off', 1: 'medium', 2: 'high'} - -# specific xpath variables -# ------------------------ - -results_xpath = './/div[contains(@jscontroller, "SC7lYd")]' -title_xpath = './/a/h3[1]' -href_xpath = './/a[h3]/@href' -content_xpath = './/div[@data-sncf]' - -# Suggestions are links placed in a *card-section*, we extract only the text -# from the links not the links itself. -suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' - -# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for -# # celebrities like '!google natasha allegri' -# # or '!google chris evans' -UI_ASYNC = 'use_ac:true,_fmt:prog' -"""Format of the response from UI's async request.""" - - -def get_google_info(params, eng_traits): - """Composing various (language) properties for the google engines (:ref:`google - API`). - - This function is called by the various google engines (:ref:`google web - engine`, :ref:`google images engine`, :ref:`google news engine` and - :ref:`google videos engine`). - - :param dict param: Request parameters of the engine. At least - a ``searxng_locale`` key should be in the dictionary. - - :param eng_traits: Engine's traits fetched from google preferences - (:py:obj:`searx.enginelib.traits.EngineTraits`) - - :rtype: dict - :returns: - Py-Dictionary with the key/value pairs: - - language: - The language code that is used by google (e.g. ``lang_en`` or - ``lang_zh-TW``) - - country: - The country code that is used by google (e.g. ``US`` or ``TW``) - - locale: - A instance of :py:obj:`babel.core.Locale` build from the - ``searxng_locale`` value. - - subdomain: - Google subdomain :py:obj:`google_domains` that fits to the country - code. - - params: - Py-Dictionary with additional request arguments (can be passed to - :py:func:`urllib.parse.urlencode`). - - - ``hl`` parameter: specifies the interface language of user interface. - - ``lr`` parameter: restricts search results to documents written in - a particular language. - - ``cr`` parameter: restricts search results to documents - originating in a particular country. 
- - ``ie`` parameter: sets the character encoding scheme that should - be used to interpret the query string ('utf8'). - - ``oe`` parameter: sets the character encoding scheme that should - be used to decode the XML result ('utf8'). - - headers: - Py-Dictionary with additional HTTP headers (can be passed to - request's headers) - - - ``Accept: '*/*`` - - """ - - ret_val = { - 'language': None, - 'country': None, - 'subdomain': None, - 'params': {}, - 'headers': {}, - 'cookies': {}, - 'locale': None, - } - - sxng_locale = params.get('searxng_locale', 'all') - try: - locale = babel.Locale.parse(sxng_locale, sep='-') - except babel.core.UnknownLocaleError: - locale = None - - eng_lang = eng_traits.get_language(sxng_locale, 'lang_en') - lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en - country = eng_traits.get_region(sxng_locale, eng_traits.all_locale) - - # Test zh_hans & zh_hant --> in the topmost links in the result list of list - # TW and HK you should a find wiktionary.org zh_hant link. In the result - # list of zh-CN should not be no hant link instead you should find - # zh.m.wikipedia.org/zh somewhere in the top. - - # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5 - # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5 - - ret_val['language'] = eng_lang - ret_val['country'] = country - ret_val['locale'] = locale - ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com') - - # hl parameter: - # The hl parameter specifies the interface language (host language) of - # your user interface. To improve the performance and the quality of your - # search results, you are strongly encouraged to set this parameter - # explicitly. - # https://developers.google.com/custom-search/docs/xml_results#hlsp - # The Interface Language: - # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages - - # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817 - ret_val['params']['hl'] = f'{lang_code}-{country}' - - # lr parameter: - # The lr (language restrict) parameter restricts search results to - # documents written in a particular language. - # https://developers.google.com/custom-search/docs/xml_results#lrsp - # Language Collection Values: - # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections - # - # To select 'all' languages an empty 'lr' value is used. - # - # Different to other google services, Google Schloar supports to select more - # than one language. The languages are seperated by a pipe '|' (logical OR). - # By example: &lr=lang_zh-TW%7Clang_de selects articles written in - # traditional chinese OR german language. - - ret_val['params']['lr'] = eng_lang - if sxng_locale == 'all': - ret_val['params']['lr'] = '' - - # cr parameter: - # The cr parameter restricts search results to documents originating in a - # particular country. - # https://developers.google.com/custom-search/docs/xml_results#crsp - - ret_val['params']['cr'] = 'country' + country - if sxng_locale == 'all': - ret_val['params']['cr'] = '' - - # gl parameter: (mandatory by Geeogle News) - # The gl parameter value is a two-letter country code. For WebSearch - # results, the gl parameter boosts search results whose country of origin - # matches the parameter value. See the Country Codes section for a list of - # valid values. - # Specifying a gl parameter value in WebSearch requests should improve the - # relevance of results. 
This is particularly true for international - # customers and, even more specifically, for customers in English-speaking - # countries other than the United States. - # https://developers.google.com/custom-search/docs/xml_results#glsp - - # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635 - # ret_val['params']['gl'] = country - - # ie parameter: - # The ie parameter sets the character encoding scheme that should be used - # to interpret the query string. The default ie value is latin1. - # https://developers.google.com/custom-search/docs/xml_results#iesp - - ret_val['params']['ie'] = 'utf8' - - # oe parameter: - # The oe parameter sets the character encoding scheme that should be used - # to decode the XML result. The default oe value is latin1. - # https://developers.google.com/custom-search/docs/xml_results#oesp - - ret_val['params']['oe'] = 'utf8' - - # num parameter: - # The num parameter identifies the number of search results to return. - # The default num value is 10, and the maximum value is 20. If you request - # more than 20 results, only 20 results will be returned. - # https://developers.google.com/custom-search/docs/xml_results#numsp - - # HINT: seems to have no effect (tested in google WEB & Images) - # ret_val['params']['num'] = 20 - - # HTTP headers - - ret_val['headers']['Accept'] = '*/*' - - # Cookies - - # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746 - # - https://github.com/searxng/searxng/issues/1555 - ret_val['cookies']['CONSENT'] = "YES+" - - return ret_val - - -def detect_google_sorry(resp): - if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'): - raise SearxEngineCaptchaException() - - -def request(query, params): - """Google search request""" - # pylint: disable=line-too-long - offset = (params['pageno'] - 1) * 10 - google_info = get_google_info(params, traits) - - # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium - query_url = ( - 'https://' - + google_info['subdomain'] - + '/search' - + "?" 
- + urlencode( - { - 'q': query, - **google_info['params'], - 'filter': '0', - 'start': offset, - # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i', - # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG', - # 'cs' : 1, - # 'sa': 'N', - # 'yv': 3, - # 'prmd': 'vin', - # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE', - # 'sa': 'N', - # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg' - # formally known as use_mobile_ui - 'asearch': 'arc', - 'async': UI_ASYNC, - } - ) - ) - - if params['time_range'] in time_range_dict: - query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) - if params['safesearch']: - query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) - params['url'] = query_url - - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA -# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; -RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);') - - -def _parse_data_images(dom): - data_image_map = {} - for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()): - end_pos = data_image.rfind('=') - if end_pos > 0: - data_image = data_image[: end_pos + 1] - data_image_map[img_id] = data_image - logger.debug('data:image objects --> %s', list(data_image_map.keys())) - return data_image_map - - -def response(resp): - """Get response from google's search request""" - # pylint: disable=too-many-branches, too-many-statements - detect_google_sorry(resp) - - results = [] - - # convert the text to dom - dom = html.fromstring(resp.text) - data_image_map = _parse_data_images(dom) - - # results --> answer - answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') - if answer_list: - answer_list = [_.xpath("normalize-space()") for _ in answer_list] - results.append({'answer': ' '.join(answer_list)}) - else: - logger.debug("did not find 'answer'") - - # parse results - - for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks - - try: - title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) - if title_tag is None: - # this not one of the common google results *section* - logger.debug('ignoring item from the result_xpath list: missing title') - continue - title = extract_text(title_tag) - - url = eval_xpath_getindex(result, href_xpath, 0, None) - if url is None: - logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) - continue - - content_nodes = eval_xpath(result, content_xpath) - content = extract_text(content_nodes) - - if not content: - logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) - continue - - img_src = content_nodes[0].xpath('.//img/@src') - if img_src: - img_src = img_src[0] - if img_src.startswith('data:image'): - img_id = content_nodes[0].xpath('.//img/@id') - if img_id: - img_src = data_image_map.get(img_id[0]) - else: - img_src = None - - results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src}) - - except Exception as e: # pylint: disable=broad-except - logger.error(e, exc_info=True) - continue - - # parse suggestion - for suggestion in eval_xpath_list(dom, suggestion_xpath): - # append suggestion - results.append({'suggestion': extract_text(suggestion)}) - - # return results - return results - - -# get supported languages from their site 
- - -skip_countries = [ - # official language of google-country not in google-languages - 'AL', # Albanien (sq) - 'AZ', # Aserbaidschan (az) - 'BD', # Bangladesch (bn) - 'BN', # Brunei Darussalam (ms) - 'BT', # Bhutan (dz) - 'ET', # Äthiopien (am) - 'GE', # Georgien (ka, os) - 'GL', # Grönland (kl) - 'KH', # Kambodscha (km) - 'LA', # Laos (lo) - 'LK', # Sri Lanka (si, ta) - 'ME', # Montenegro (sr) - 'MK', # Nordmazedonien (mk, sq) - 'MM', # Myanmar (my) - 'MN', # Mongolei (mn) - 'MV', # Malediven (dv) // dv_MV is unknown by babel - 'MY', # Malaysia (ms) - 'NP', # Nepal (ne) - 'TJ', # Tadschikistan (tg) - 'TM', # Turkmenistan (tk) - 'UZ', # Usbekistan (uz) -] - - -def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True): - """Fetch languages from Google.""" - # pylint: disable=import-outside-toplevel, too-many-branches - - engine_traits.custom['supported_domains'] = {} - - resp = get('https://www.google.com/preferences') - if not resp.ok: # type: ignore - raise RuntimeError("Response from Google's preferences is not OK.") - - dom = html.fromstring(resp.text) # type: ignore - - # supported language codes - - lang_map = {'no': 'nb'} - for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'): - - eng_lang = x.get("value").split('_')[-1] - try: - locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') - except babel.UnknownLocaleError: - print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) - continue - sxng_lang = language_tag(locale) - - conflict = engine_traits.languages.get(sxng_lang) - if conflict: - if conflict != eng_lang: - print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) - continue - engine_traits.languages[sxng_lang] = 'lang_' + eng_lang - - # alias languages - engine_traits.languages['zh'] = 'lang_zh-CN' - - # supported region codes - - for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'): - eng_country = x.get("value") - - if eng_country in skip_countries: - continue - if eng_country == 'ZZ': - engine_traits.all_locale = 'ZZ' - continue - - sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True) - - if not sxng_locales: - print("ERROR: can't map from google country %s (%s) to a babel region." 
% (x.get('data-name'), eng_country)) - continue - - for sxng_locale in sxng_locales: - engine_traits.regions[region_tag(sxng_locale)] = eng_country - - # alias regions - engine_traits.regions['zh-CN'] = 'HK' - - # supported domains - - if add_domains: - resp = get('https://www.google.com/supported_domains') - if not resp.ok: # type: ignore - raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.") - - for domain in resp.text.split(): # type: ignore - domain = domain.strip() - if not domain or domain in [ - '.google.com', - ]: - continue - region = domain.split('.')[-1].upper() - engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore - if region == 'HK': - # There is no google.cn, we use .com.hk for zh-CN - engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore diff --git a/apps/searxng/searx/engines/google_images.py b/apps/searxng/searx/engines/google_images.py deleted file mode 100755 index e6445b1..0000000 --- a/apps/searxng/searx/engines/google_images.py +++ /dev/null @@ -1,129 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This is the implementation of the Google Images engine using the internal -Google API used by the Google Go Android app. - -This internal API offer results in - -- JSON (``_fmt:json``) -- Protobuf_ (``_fmt:pb``) -- Protobuf_ compressed? (``_fmt:pc``) -- HTML (``_fmt:html``) -- Protobuf_ encoded in JSON (``_fmt:jspb``). - -.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers -""" - -from typing import TYPE_CHECKING - -from urllib.parse import urlencode -from json import loads - -from searx.engines.google import fetch_traits # pylint: disable=unused-import -from searx.engines.google import ( - get_google_info, - time_range_dict, - detect_google_sorry, -) - -if TYPE_CHECKING: - import logging - from searx.enginelib.traits import EngineTraits - - logger: logging.Logger - traits: EngineTraits - - -# about -about = { - "website": 'https://images.google.com', - "wikidata_id": 'Q521550', - "official_api_documentation": 'https://developers.google.com/custom-search', - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['images', 'web'] -paging = True -time_range_support = True -safesearch = True -send_accept_language_header = True - -filter_mapping = {0: 'images', 1: 'active', 2: 'active'} - - -def request(query, params): - """Google-Image search request""" - - google_info = get_google_info(params, traits) - - query_url = ( - 'https://' - + google_info['subdomain'] - + '/search' - + "?" 
- + urlencode( - { - 'q': query, - 'tbm': "isch", - **google_info['params'], - 'asearch': 'isch', - 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), - } - ) - ) - - if params['time_range'] in time_range_dict: - query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) - if params['safesearch']: - query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) - params['url'] = query_url - - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -def response(resp): - """Get response from google's search request""" - results = [] - - detect_google_sorry(resp) - - json_start = resp.text.find('{"ischj":') - json_data = loads(resp.text[json_start:]) - - for item in json_data["ischj"]["metadata"]: - - result_item = { - 'url': item["result"]["referrer_url"], - 'title': item["result"]["page_title"], - 'content': item["text_in_grid"]["snippet"], - 'source': item["result"]["site_title"], - 'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}', - 'img_src': item["original_image"]["url"], - 'thumbnail_src': item["thumbnail"]["url"], - 'template': 'images.html', - } - - author = item["result"].get('iptc', {}).get('creator') - if author: - result_item['author'] = ', '.join(author) - - copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') - if copyright_notice: - result_item['source'] += ' | ' + copyright_notice - - freshness_date = item["result"].get("freshness_date") - if freshness_date: - result_item['source'] += ' | ' + freshness_date - - file_size = item.get('gsa', {}).get('file_size') - if file_size: - result_item['source'] += ' (%s)' % file_size - - results.append(result_item) - - return results diff --git a/apps/searxng/searx/engines/google_news.py b/apps/searxng/searx/engines/google_news.py deleted file mode 100755 index 4b1bffa..0000000 --- a/apps/searxng/searx/engines/google_news.py +++ /dev/null @@ -1,305 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This is the implementation of the Google News engine. - -Google News has a different region handling compared to Google WEB. - -- the ``ceid`` argument has to be set (:py:obj:`ceid_list`) -- the hl_ argument has to be set correctly (and different to Google WEB) -- the gl_ argument is mandatory - -If one of this argument is not set correctly, the request is redirected to -CONSENT dialog:: - - https://consent.google.com/m?continue= - -The google news API ignores some parameters from the common :ref:`google API`: - -- num_ : the number of search results is ignored / there is no paging all - results for a query term are in the first response. -- save_ : is ignored / Google-News results are always *SafeSearch* - -.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp -.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp -.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp -.. 
_save: https://developers.google.com/custom-search/docs/xml_results#safesp -""" - -from typing import TYPE_CHECKING - -from urllib.parse import urlencode -import base64 -from lxml import html -import babel - -from searx import locales -from searx.utils import ( - eval_xpath, - eval_xpath_list, - eval_xpath_getindex, - extract_text, -) - -from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import -from searx.engines.google import ( - get_google_info, - detect_google_sorry, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://news.google.com', - "wikidata_id": 'Q12020', - "official_api_documentation": 'https://developers.google.com/custom-search', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['news'] -paging = False -time_range_support = False - -# Google-News results are always *SafeSearch*. Option 'safesearch' is set to -# False here, otherwise checker will report safesearch-errors:: -# -# safesearch : results are identitical for safesearch=0 and safesearch=2 -safesearch = True -# send_accept_language_header = True - - -def request(query, params): - """Google-News search request""" - - sxng_locale = params.get('searxng_locale', 'en-US') - ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en') - google_info = get_google_info(params, traits) - google_info['subdomain'] = 'news.google.com' # google news has only one domain - - ceid_region, ceid_lang = ceid.split(':') - ceid_lang, ceid_suffix = ( - ceid_lang.split('-') - + [ - None, - ] - )[:2] - - google_info['params']['hl'] = ceid_lang - - if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']: - - if ceid_region.lower() == ceid_lang: - google_info['params']['hl'] = ceid_lang + '-' + ceid_region - else: - google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix - - elif ceid_region.lower() != ceid_lang: - - if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']: - google_info['params']['hl'] = ceid_lang - else: - google_info['params']['hl'] = ceid_lang + '-' + ceid_region - - google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0] - google_info['params']['gl'] = ceid_region - - query_url = ( - 'https://' - + google_info['subdomain'] - + "/search?" - + urlencode( - { - 'q': query, - **google_info['params'], - } - ) - # ceid includes a ':' character which must not be urlencoded - + ('&ceid=%s' % ceid) - ) - - params['url'] = query_url - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -def response(resp): - """Get response from google's search request""" - results = [] - detect_google_sorry(resp) - - # convert the text to dom - dom = html.fromstring(resp.text) - - for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): - - # The first
<a> tag in the <article>
contains the link to the article - # The href attribute of the tag is a google internal link, we have - # to decode - - href = eval_xpath_getindex(result, './article/a/@href', 0) - href = href.split('?')[0] - href = href.split('/')[-1] - href = base64.urlsafe_b64decode(href + '====') - href = href[href.index(b'http') :].split(b'\xd2')[0] - href = href.decode() - - title = extract_text(eval_xpath(result, './article/h3[1]')) - - # The pub_date is mostly a string like 'yesertday', not a real - # timezone date or time. Therefore we can't use publishedDate. - pub_date = extract_text(eval_xpath(result, './article//time')) - pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]')) - - content = ' / '.join([x for x in [pub_origin, pub_date] if x]) - - # The image URL is located in a preceding sibling tag, e.g.: - # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100" - # These URL are long but not personalized (double checked via tor). - - img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src')) - - results.append( - { - 'url': href, - 'title': title, - 'content': content, - 'img_src': img_src, - } - ) - - # return results - return results - - -ceid_list = [ - 'AE:ar', - 'AR:es-419', - 'AT:de', - 'AU:en', - 'BD:bn', - 'BE:fr', - 'BE:nl', - 'BG:bg', - 'BR:pt-419', - 'BW:en', - 'CA:en', - 'CA:fr', - 'CH:de', - 'CH:fr', - 'CL:es-419', - 'CN:zh-Hans', - 'CO:es-419', - 'CU:es-419', - 'CZ:cs', - 'DE:de', - 'EG:ar', - 'ES:es', - 'ET:en', - 'FR:fr', - 'GB:en', - 'GH:en', - 'GR:el', - 'HK:zh-Hant', - 'HU:hu', - 'ID:en', - 'ID:id', - 'IE:en', - 'IL:en', - 'IL:he', - 'IN:bn', - 'IN:en', - 'IN:hi', - 'IN:ml', - 'IN:mr', - 'IN:ta', - 'IN:te', - 'IT:it', - 'JP:ja', - 'KE:en', - 'KR:ko', - 'LB:ar', - 'LT:lt', - 'LV:en', - 'LV:lv', - 'MA:fr', - 'MX:es-419', - 'MY:en', - 'NA:en', - 'NG:en', - 'NL:nl', - 'NO:no', - 'NZ:en', - 'PE:es-419', - 'PH:en', - 'PK:en', - 'PL:pl', - 'PT:pt-150', - 'RO:ro', - 'RS:sr', - 'RU:ru', - 'SA:ar', - 'SE:sv', - 'SG:en', - 'SI:sl', - 'SK:sk', - 'SN:fr', - 'TH:th', - 'TR:tr', - 'TW:zh-Hant', - 'TZ:en', - 'UA:ru', - 'UA:uk', - 'UG:en', - 'US:en', - 'US:es-419', - 'VE:es-419', - 'VN:vi', - 'ZA:en', - 'ZW:en', -] -"""List of region/language combinations supported by Google News. 
Values of the -``ceid`` argument of the Google News REST API.""" - - -_skip_values = [ - 'ET:en', # english (ethiopia) - 'ID:en', # english (indonesia) - 'LV:en', # english (latvia) -] - -_ceid_locale_map = {'NO:no': 'nb-NO'} - - -def fetch_traits(engine_traits: EngineTraits): - _fetch_traits(engine_traits, add_domains=False) - - engine_traits.custom['ceid'] = {} - - for ceid in ceid_list: - if ceid in _skip_values: - continue - - region, lang = ceid.split(':') - x = lang.split('-') - if len(x) > 1: - if x[1] not in ['Hant', 'Hans']: - lang = x[0] - - sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region) - try: - locale = babel.Locale.parse(sxng_locale, sep='-') - except babel.UnknownLocaleError: - print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale)) - continue - - engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid diff --git a/apps/searxng/searx/engines/google_play.py b/apps/searxng/searx/engines/google_play.py deleted file mode 100755 index a9cfd1a..0000000 --- a/apps/searxng/searx/engines/google_play.py +++ /dev/null @@ -1,116 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Google Play Apps & Google Play Movies -""" - -from urllib.parse import urlencode -from lxml import html -from searx.utils import ( - eval_xpath, - extract_url, - extract_text, - eval_xpath_list, - eval_xpath_getindex, -) - -about = { - "website": "https://play.google.com/", - "wikidata_id": "Q79576", - "use_official_api": False, - "require_api_key": False, - "results": "HTML", -} - -send_accept_language_header = True - -play_categ = None # apps|movies -base_url = 'https://play.google.com' -search_url = base_url + "/store/search?{query}&c={play_categ}" - - -def request(query, params): - - if play_categ not in ('movies', 'apps'): - raise ValueError(f"unknown google play category: {play_categ}") - - params["url"] = search_url.format( - query=urlencode({"q": query}), - play_categ=play_categ, - ) - params['cookies']['CONSENT'] = "YES+" - - return params - - -def response(resp): - - if play_categ == 'movies': - return response_movies(resp) - if play_categ == 'apps': - return response_apps(resp) - - raise ValueError(f"Unsupported play category: {play_categ}") - - -def response_movies(resp): - - results = [] - dom = html.fromstring(resp.text) - - for section in eval_xpath(dom, '//c-wiz/section/header/..'): - sec_name = extract_text(eval_xpath(section, './header')) - for item in eval_xpath(section, './/a'): - url = base_url + item.get('href') - div_1, div_2 = eval_xpath(item, './div')[:2] - title = extract_text(eval_xpath(div_2, './div[@title]')) - metadata = extract_text(eval_xpath(div_2, './div[@class]')) - img = eval_xpath(div_1, './/img')[0] - img_src = img.get('src') - results.append( - { - "url": url, - "title": title, - "content": sec_name, - "img_src": img_src, - 'metadata': metadata, - 'template': 'videos.html', - } - ) - return results - - -def response_apps(resp): - - results = [] - dom = html.fromstring(resp.text) - - if eval_xpath(dom, '//div[@class="v6DsQb"]'): - return [] - - spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None) - if spot is not None: - url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url) - title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]')) - content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]')) - img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src')) - - results.append({"url": url, "title": title, "content": content, "img_src": img}) - - more = 
eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1) - for result in more: - url = extract_url(eval_xpath(result, ".//a/@href"), search_url) - title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]')) - content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]')) - img = extract_text( - eval_xpath( - result, - './/img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src', - ) - ) - - results.append({"url": url, "title": title, "content": content, "img_src": img}) - - for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'): - results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))}) - - return results diff --git a/apps/searxng/searx/engines/google_scholar.py b/apps/searxng/searx/engines/google_scholar.py deleted file mode 100755 index 6f33d1e..0000000 --- a/apps/searxng/searx/engines/google_scholar.py +++ /dev/null @@ -1,217 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This is the implementation of the Google Scholar engine. - -Compared to other Google services the Scholar engine has a simple GET REST-API -and there does not exists `async` API. Even though the API slightly vintage we -can make use of the :ref:`google API` to assemble the arguments of the GET -request. -""" - -from typing import TYPE_CHECKING -from typing import Optional - -from urllib.parse import urlencode -from datetime import datetime -from lxml import html - -from searx.utils import ( - eval_xpath, - eval_xpath_getindex, - eval_xpath_list, - extract_text, -) - -from searx.exceptions import SearxEngineCaptchaException - -from searx.engines.google import fetch_traits # pylint: disable=unused-import -from searx.engines.google import ( - get_google_info, - time_range_dict, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://scholar.google.com', - "wikidata_id": 'Q494817', - "official_api_documentation": 'https://developers.google.com/custom-search', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['science', 'scientific publications'] -paging = True -language_support = True -time_range_support = True -safesearch = False -send_accept_language_header = True - - -def time_range_args(params): - """Returns a dictionary with a time range arguments based on - ``params['time_range']``. - - Google Scholar supports a detailed search by year. Searching by *last - month* or *last week* (as offered by SearXNG) is uncommon for scientific - publications and is not supported by Google Scholar. - - To limit the result list when the users selects a range, all the SearXNG - ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range - is set an empty dictionary of arguments is returned. Example; when - user selects a time range (current year minus one in 2022): - - .. code:: python - - { 'as_ylo' : 2021 } - - """ - ret_val = {} - if params['time_range'] in time_range_dict: - ret_val['as_ylo'] = datetime.now().year - 1 - return ret_val - - -def detect_google_captcha(dom): - """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is - not redirected to ``sorry.google.com``. 
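    For orientation, an editor's sketch (not part of the original module) of
    the GET request that ``request()`` below assembles, assuming the
    ``scholar.google.com`` subdomain, page 2, and a one-year time range; the
    query term is a made-up example:

    .. code:: python

        from urllib.parse import urlencode

        args = {
            'q': 'graph neural networks',  # example query
            'start': 10,                   # (pageno - 1) * 10
            'as_sdt': '2007',              # include patents
            'as_vis': '0',                 # include citations
            'as_ylo': 2024,                # datetime.now().year - 1 (example)
        }
        url = 'https://scholar.google.com/scholar?' + urlencode(args)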
- """ - if eval_xpath(dom, "//form[@id='gs_captcha_f']"): - raise SearxEngineCaptchaException() - - -def request(query, params): - """Google-Scholar search request""" - - google_info = get_google_info(params, traits) - # subdomain is: scholar.google.xy - google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") - - args = { - 'q': query, - **google_info['params'], - 'start': (params['pageno'] - 1) * 10, - 'as_sdt': '2007', # include patents / to disable set '0,5' - 'as_vis': '0', # include citations / to disable set '1' - } - args.update(time_range_args(params)) - - params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -def parse_gs_a(text: Optional[str]): - """Parse the text written in green. - - Possible formats: - * "{authors} - {journal}, {year} - {publisher}" - * "{authors} - {year} - {publisher}" - * "{authors} - {publisher}" - """ - if text is None or text == "": - return None, None, None, None - - s_text = text.split(' - ') - authors = s_text[0].split(', ') - publisher = s_text[-1] - if len(s_text) != 3: - return authors, None, publisher, None - - # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" - # get journal and year - journal_year = s_text[1].split(', ') - # journal is optional and may contains some coma - if len(journal_year) > 1: - journal = ', '.join(journal_year[0:-1]) - if journal == '…': - journal = None - else: - journal = None - # year - year = journal_year[-1] - try: - publishedDate = datetime.strptime(year.strip(), '%Y') - except ValueError: - publishedDate = None - return authors, journal, publisher, publishedDate - - -def response(resp): # pylint: disable=too-many-locals - """Parse response from Google Scholar""" - results = [] - - # convert the text to dom - dom = html.fromstring(resp.text) - detect_google_captcha(dom) - - # parse results - for result in eval_xpath_list(dom, '//div[@data-rp]'): - - title = extract_text(eval_xpath(result, './/h3[1]//a')) - - if not title: - # this is a [ZITATION] block - continue - - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) - if pub_type: - pub_type = pub_type[1:-1].lower() - - url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) - content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) - authors, journal, publisher, publishedDate = parse_gs_a( - extract_text(eval_xpath(result, './/div[@class="gs_a"]')) - ) - if publisher in url: - publisher = None - - # cited by - comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) - - # link to the html or pdf document - html_url = None - pdf_url = None - doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) - doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) - if doc_type == "[PDF]": - pdf_url = doc_url - else: - html_url = doc_url - - results.append( - { - 'template': 'paper.html', - 'type': pub_type, - 'url': url, - 'title': title, - 'authors': authors, - 'publisher': publisher, - 'journal': journal, - 'publishedDate': publishedDate, - 'content': content, - 'comments': comments, - 'html_url': html_url, - 'pdf_url': pdf_url, - } - ) - - # parse suggestion - for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): - # append suggestion - results.append({'suggestion': 
extract_text(suggestion)}) - - for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'): - results.append({'correction': extract_text(correction)}) - - return results diff --git a/apps/searxng/searx/engines/google_videos.py b/apps/searxng/searx/engines/google_videos.py deleted file mode 100755 index 985189d..0000000 --- a/apps/searxng/searx/engines/google_videos.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This is the implementation of the Google Videos engine. - -.. admonition:: Content-Security-Policy (CSP) - - This engine needs to allow images from the `data URLs`_ (prefixed with the - ``data:`` scheme):: - - Header set Content-Security-Policy "img-src 'self' data: ;" - -.. _data URLs: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs - -""" - -from typing import TYPE_CHECKING - -from urllib.parse import urlencode -from lxml import html - -from searx.utils import ( - eval_xpath, - eval_xpath_list, - eval_xpath_getindex, - extract_text, -) - -from searx.engines.google import fetch_traits # pylint: disable=unused-import -from searx.engines.google import ( - get_google_info, - time_range_dict, - filter_mapping, - suggestion_xpath, - detect_google_sorry, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://www.google.com', - "wikidata_id": 'Q219885', - "official_api_documentation": 'https://developers.google.com/custom-search', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config - -categories = ['videos', 'web'] -paging = True -language_support = True -time_range_support = True -safesearch = True - - -def request(query, params): - """Google-Video search request""" - - google_info = get_google_info(params, traits) - - query_url = ( - 'https://' - + google_info['subdomain'] - + '/search' - + "?" 
- + urlencode( - { - 'q': query, - 'tbm': "vid", - 'start': 10 * params['pageno'], - **google_info['params'], - 'asearch': 'arc', - 'async': 'use_ac:true,_fmt:html', - } - ) - ) - - if params['time_range'] in time_range_dict: - query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) - if params['safesearch']: - query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) - params['url'] = query_url - - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -def response(resp): - """Get response from google's search request""" - results = [] - - detect_google_sorry(resp) - - # convert the text to dom - dom = html.fromstring(resp.text) - - # parse results - for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): - - img_src = eval_xpath_getindex(result, './/img/@src', 0, None) - if img_src is None: - continue - - title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0)) - url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0) - - c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) - content = extract_text(c_node) - pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]')) - length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]')) - - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'author': pub_info, - 'thumbnail': img_src, - 'length': length, - 'template': 'videos.html', - } - ) - - # parse suggestion - for suggestion in eval_xpath_list(dom, suggestion_xpath): - # append suggestion - results.append({'suggestion': extract_text(suggestion)}) - - return results diff --git a/apps/searxng/searx/engines/imdb.py b/apps/searxng/searx/engines/imdb.py deleted file mode 100755 index 0897b8d..0000000 --- a/apps/searxng/searx/engines/imdb.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint - -"""IMDB - Internet Movie Database - -Retrieves results from a basic search. Advanced search options are not -supported. IMDB's API is undocumented, here are some posts about: - -- https://stackoverflow.com/questions/1966503/does-imdb-provide-an-api -- https://rapidapi.com/blog/how-to-use-imdb-api/ - -An alternative that needs IMDPro_ is `IMDb and Box Office Mojo -`_ - -.. 
__IMDPro: https://pro.imdb.com/login - -""" - -import json - -about = { - "website": 'https://imdb.com/', - "wikidata_id": 'Q37312', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = [] -paging = False - -# suggestion_url = "https://sg.media-imdb.com/suggestion/{letter}/{query}.json" -suggestion_url = "https://v2.sg.media-imdb.com/suggestion/{letter}/{query}.json" - -href_base = 'https://imdb.com/{category}/{entry_id}' - -search_categories = {"nm": "name", "tt": "title", "kw": "keyword", "co": "company", "ep": "episode"} - - -def request(query, params): - - query = query.replace(" ", "_").lower() - params['url'] = suggestion_url.format(letter=query[0], query=query) - - return params - - -def response(resp): - - suggestions = json.loads(resp.text) - results = [] - - for entry in suggestions.get('d', []): - - # https://developer.imdb.com/documentation/key-concepts#imdb-ids - entry_id = entry['id'] - categ = search_categories.get(entry_id[:2]) - if categ is None: - logger.error('skip unknown category tag %s in %s', entry_id[:2], entry_id) - continue - - title = entry['l'] - if 'q' in entry: - title += " (%s)" % entry['q'] - - content = '' - if 'rank' in entry: - content += "(%s) " % entry['rank'] - if 'y' in entry: - content += str(entry['y']) + " - " - if 's' in entry: - content += entry['s'] - - # imageUrl is the image itself, it is not a thumb! - image_url = entry.get('i', {}).get('imageUrl') - if image_url: - # get thumbnail - image_url_name, image_url_prefix = image_url.rsplit('.', 1) - # recipe to get the magic value: - # * search on imdb.com, look at the URL of the thumbnail on the right side of the screen - # * search using the imdb engine, compare the imageUrl and thumbnail URL - # QL75 : JPEG quality (?) - # UX280 : resize to width 320 - # 280,414 : size of the image (add white border) - magic = 'QL75_UX280_CR0,0,280,414_' - if not image_url_name.endswith('_V1_'): - magic = '_V1_' + magic - image_url = image_url_name + magic + '.' 
+ image_url_prefix - results.append( - { - "title": title, - "url": href_base.format(category=categ, entry_id=entry_id), - "content": content, - "img_src": image_url, - } - ) - - return results diff --git a/apps/searxng/searx/engines/ina.py b/apps/searxng/searx/engines/ina.py deleted file mode 100755 index e5fba20..0000000 --- a/apps/searxng/searx/engines/ina.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - INA (Videos) -""" - -from html import unescape -from urllib.parse import urlencode -from lxml import html -from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex - -# about -about = { - "website": 'https://www.ina.fr/', - "wikidata_id": 'Q1665109', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', - "language": 'fr', -} - -# engine dependent config -categories = ['videos'] -paging = True -page_size = 12 - -# search-url -base_url = 'https://www.ina.fr' -search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size' - -# specific xpath variables -results_xpath = '//div[@id="searchHits"]/div' -url_xpath = './/a/@href' -title_xpath = './/div[contains(@class,"title-bloc-small")]' -content_xpath = './/div[contains(@class,"sous-titre-fonction")]' -thumbnail_xpath = './/img/@data-src' -publishedDate_xpath = './/div[contains(@class,"dateAgenda")]' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query})) - return params - - -# get response from search-request -def response(resp): - results = [] - - # we get html in a JSON container... - dom = html.fromstring(resp.text) - - # parse results - for result in eval_xpath_list(dom, results_xpath): - url_relative = eval_xpath_getindex(result, url_xpath, 0) - url = base_url + url_relative - title = unescape(extract_text(eval_xpath(result, title_xpath))) - thumbnail = extract_text(eval_xpath(result, thumbnail_xpath)) - content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text( - eval_xpath(result, content_xpath) - ) - - # append result - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'template': 'videos.html', - 'thumbnail': thumbnail, - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/invidious.py b/apps/searxng/searx/engines/invidious.py deleted file mode 100755 index 29f2766..0000000 --- a/apps/searxng/searx/engines/invidious.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Invidious (Videos) -""" - -import time -import random -from urllib.parse import quote_plus -from dateutil import parser - -# about -about = { - "website": 'https://api.invidious.io/', - "wikidata_id": 'Q79343316', - "official_api_documentation": 'https://github.com/iv-org/documentation/blob/master/API.md', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ["videos", "music"] -paging = True -time_range_support = True - -# base_url can be overwritten by a list of URLs in the settings.yml -base_url = 'https://vid.puffyan.us' - - -def request(query, params): - time_range_dict = { - "day": "today", - "week": "week", - "month": "month", - "year": "year", - } - - if isinstance(base_url, list): - params["base_url"] = random.choice(base_url) - else: - params["base_url"] = base_url - - search_url = 
params["base_url"] + "/api/v1/search?q={query}" - params["url"] = search_url.format(query=quote_plus(query)) + "&page={pageno}".format(pageno=params["pageno"]) - - if params["time_range"] in time_range_dict: - params["url"] += "&date={timerange}".format(timerange=time_range_dict[params["time_range"]]) - - if params["language"] != "all": - lang = params["language"].split("-") - if len(lang) == 2: - params["url"] += "&range={lrange}".format(lrange=lang[1]) - - return params - - -def response(resp): - results = [] - - search_results = resp.json() - base_invidious_url = resp.search_params['base_url'] + "/watch?v=" - - for result in search_results: - rtype = result.get("type", None) - if rtype == "video": - videoid = result.get("videoId", None) - if not videoid: - continue - - url = base_invidious_url + videoid - thumbs = result.get("videoThumbnails", []) - thumb = next((th for th in thumbs if th["quality"] == "sddefault"), None) - if thumb: - thumbnail = thumb.get("url", "") - else: - thumbnail = "" - - publishedDate = parser.parse(time.ctime(result.get("published", 0))) - length = time.gmtime(result.get("lengthSeconds")) - if length.tm_hour: - length = time.strftime("%H:%M:%S", length) - else: - length = time.strftime("%M:%S", length) - - results.append( - { - "url": url, - "title": result.get("title", ""), - "content": result.get("description", ""), - 'length': length, - "template": "videos.html", - "author": result.get("author"), - "publishedDate": publishedDate, - "iframe_src": resp.search_params['base_url'] + '/embed/' + videoid, - "thumbnail": thumbnail, - } - ) - - return results diff --git a/apps/searxng/searx/engines/jisho.py b/apps/searxng/searx/engines/jisho.py deleted file mode 100755 index 7f4392b..0000000 --- a/apps/searxng/searx/engines/jisho.py +++ /dev/null @@ -1,137 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" -Jisho (the Japanese-English dictionary) -""" - -from urllib.parse import urlencode, urljoin - -# about -about = { - "website": 'https://jisho.org', - "wikidata_id": 'Q24568389', - "official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api", - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', - "language": 'ja', -} - -categories = ['dictionaries'] -paging = False - -URL = 'https://jisho.org' -BASE_URL = 'https://jisho.org/word/' -SEARCH_URL = URL + '/api/v1/search/words?{query}' - - -def request(query, params): - query = urlencode({'keyword': query}) - params['url'] = SEARCH_URL.format(query=query) - logger.debug(f"query_url --> {params['url']}") - return params - - -def response(resp): - results = [] - first_result = True - - search_results = resp.json() - - for page in search_results.get('data', []): - # Entries that are purely from Wikipedia are excluded. - parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech') - if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition': - pass - - # Process alternative forms - alt_forms = [] - for title_raw in page['japanese']: - if 'word' not in title_raw: - alt_forms.append(title_raw['reading']) - else: - title = title_raw['word'] - if 'reading' in title_raw: - title += ' (' + title_raw['reading'] + ')' - alt_forms.append(title) - - result_url = urljoin(BASE_URL, page['slug']) - definitions = get_definitions(page) - - # For results, we'll return the URL, all alternative forms (as title), - # and all definitions (as description) truncated to 300 characters. - content = " ".join(f"{engdef}." 
for _, engdef, _ in definitions) - results.append( - {'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')} - ) - - # Like Wordnik, we'll return the first result in an infobox too. - if first_result: - first_result = False - results.append(get_infobox(alt_forms, result_url, definitions)) - - return results - - -def get_definitions(page): - # Process definitions - definitions = [] - for defn_raw in page['senses']: - extra = [] - # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. - if defn_raw.get('tags'): - if defn_raw.get('info'): - # "usually written as kana: " - extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ') - else: - # abbreviation, archaism, etc. - extra.append(', '.join(defn_raw['tags']) + '. ') - elif defn_raw.get('info'): - # inconsistent - extra.append(', '.join(defn_raw['info']).capitalize() + '. ') - if defn_raw.get('restrictions'): - extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ') - definitions.append( - ( - ', '.join(defn_raw['parts_of_speech']), - '; '.join(defn_raw['english_definitions']), - ''.join(extra)[:-1], - ) - ) - return definitions - - -def get_infobox(alt_forms, result_url, definitions): - infobox_content = [] - # title & alt_forms - infobox_title = alt_forms[0] - if len(alt_forms) > 1: - infobox_content.append(f'

Other forms: {", ".join(alt_forms[1:])}

') - - # definitions - infobox_content.append( - ''' -
JMdict - and JMnedict - by EDRDG, CC BY-SA 3.0. -
    - ''' - ) - for pos, engdef, extra in definitions: - if pos == 'Wikipedia definition': - infobox_content.append('
Wikipedia, CC BY-SA 3.0.
    ') - pos = f'{pos}: ' if pos else '' - extra = f' ({extra})' if extra else '' - infobox_content.append(f'
  • {pos}{engdef}{extra}
  • ') - infobox_content.append('
') - - # - return { - 'infobox': infobox_title, - 'content': ''.join(infobox_content), - 'urls': [ - { - 'title': 'Jisho.org', - 'url': result_url, - } - ], - } diff --git a/apps/searxng/searx/engines/json_engine.py b/apps/searxng/searx/engines/json_engine.py deleted file mode 100755 index 2dd3bc5..0000000 --- a/apps/searxng/searx/engines/json_engine.py +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -from collections.abc import Iterable -from json import loads -from urllib.parse import urlencode -from searx.utils import to_string, html_to_text - - -search_url = None -url_query = None -content_query = None -title_query = None -content_html_to_text = False -title_html_to_text = False -paging = False -suggestion_query = '' -results_query = '' - -cookies = {} -headers = {} -'''Some engines might offer different result based on cookies or headers. -Possible use-case: To set safesearch cookie or header to moderate.''' - -# parameters for engines with paging support -# -# number of results on each page -# (only needed if the site requires not a page number, but an offset) -page_size = 1 -# number of the first page (usually 0 or 1) -first_page_num = 1 - - -def iterate(iterable): - if type(iterable) == dict: - it = iterable.items() - - else: - it = enumerate(iterable) - for index, value in it: - yield str(index), value - - -def is_iterable(obj): - if type(obj) == str: - return False - return isinstance(obj, Iterable) - - -def parse(query): - q = [] - for part in query.split('/'): - if part == '': - continue - else: - q.append(part) - return q - - -def do_query(data, q): - ret = [] - if not q: - return ret - - qkey = q[0] - - for key, value in iterate(data): - - if len(q) == 1: - if key == qkey: - ret.append(value) - elif is_iterable(value): - ret.extend(do_query(value, q)) - else: - if not is_iterable(value): - continue - if key == qkey: - ret.extend(do_query(value, q[1:])) - else: - ret.extend(do_query(value, q)) - return ret - - -def query(data, query_string): - q = parse(query_string) - - return do_query(data, q) - - -def request(query, params): - query = urlencode({'q': query})[2:] - - fp = {'query': query} - if paging and search_url.find('{pageno}') >= 0: - fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num - - params['cookies'].update(cookies) - params['headers'].update(headers) - - params['url'] = search_url.format(**fp) - params['query'] = query - - return params - - -def identity(arg): - return arg - - -def response(resp): - results = [] - json = loads(resp.text) - - title_filter = html_to_text if title_html_to_text else identity - content_filter = html_to_text if content_html_to_text else identity - - if results_query: - rs = query(json, results_query) - if not len(rs): - return results - for result in rs[0]: - try: - url = query(result, url_query)[0] - title = query(result, title_query)[0] - except: - continue - try: - content = query(result, content_query)[0] - except: - content = "" - results.append( - { - 'url': to_string(url), - 'title': title_filter(to_string(title)), - 'content': content_filter(to_string(content)), - } - ) - else: - for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)): - results.append( - { - 'url': to_string(url), - 'title': title_filter(to_string(title)), - 'content': content_filter(to_string(content)), - } - ) - - if not suggestion_query: - return results - for suggestion in query(json, suggestion_query): - results.append({'suggestion': suggestion}) - return 
results diff --git a/apps/searxng/searx/engines/kickass.py b/apps/searxng/searx/engines/kickass.py deleted file mode 100755 index 2636467..0000000 --- a/apps/searxng/searx/engines/kickass.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Kickass Torrent (Videos, Music, Files) -""" - -from lxml import html -from operator import itemgetter -from urllib.parse import quote, urljoin -from searx.utils import extract_text, get_torrent_size, convert_str_to_int - -# about -about = { - "website": 'https://kickass.so', - "wikidata_id": 'Q17062285', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files'] -paging = True - -# search-url -url = 'https://kickass.cd/' -search_url = url + 'search/{search_term}/{pageno}/' - -# specific xpath variables -magnet_xpath = './/a[@title="Torrent magnet link"]' -torrent_xpath = './/a[@title="Download torrent file"]' -content_xpath = './/span[@class="font11px lightgrey block"]' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno']) - - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - search_res = dom.xpath('//table[@class="data"]//tr') - - # return empty array if nothing is found - if not search_res: - return [] - - # parse results - for result in search_res[1:]: - link = result.xpath('.//a[@class="cellMainLink"]')[0] - href = urljoin(url, link.attrib['href']) - title = extract_text(link) - content = extract_text(result.xpath(content_xpath)) - seed = extract_text(result.xpath('.//td[contains(@class, "green")]')) - leech = extract_text(result.xpath('.//td[contains(@class, "red")]')) - filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]')) - files = extract_text(result.xpath('.//td[contains(@class, "center")][2]')) - - seed = convert_str_to_int(seed) - leech = convert_str_to_int(leech) - - filesize, filesize_multiplier = filesize_info.split() - filesize = get_torrent_size(filesize, filesize_multiplier) - if files.isdigit(): - files = int(files) - else: - files = None - - magnetlink = result.xpath(magnet_xpath)[0].attrib['href'] - - torrentfile = result.xpath(torrent_xpath)[0].attrib['href'] - torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*") - - # append result - results.append( - { - 'url': href, - 'title': title, - 'content': content, - 'seed': seed, - 'leech': leech, - 'filesize': filesize, - 'files': files, - 'magnetlink': magnetlink, - 'torrentfile': torrentfileurl, - 'template': 'torrent.html', - } - ) - - # return results sorted by seeder - return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/apps/searxng/searx/engines/lemmy.py b/apps/searxng/searx/engines/lemmy.py deleted file mode 100755 index 8c1b221..0000000 --- a/apps/searxng/searx/engines/lemmy.py +++ /dev/null @@ -1,203 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This engine uses the Lemmy API (https://lemmy.ml/api/v3/search), which is -documented at `lemmy-js-client`_ / `Interface Search`_. Since Lemmy is -federated, results are from many different, independent lemmy instances, and not -only the official one. - -.. _lemmy-js-client: https://join-lemmy.org/api/modules.html -.. 
_Interface Search: https://join-lemmy.org/api/interfaces/Search.html - -Configuration -============= - -The engine has the following additional settings: - -- :py:obj:`base_url` -- :py:obj:`lemmy_type` - -This implementation is used by different lemmy engines in the :ref:`settings.yml -`: - -.. code:: yaml - - - name: lemmy communities - lemmy_type: Communities - ... - - name: lemmy users - lemmy_type: Users - ... - - name: lemmy posts - lemmy_type: Posts - ... - - name: lemmy comments - lemmy_type: Comments - ... - -Implementations -=============== - -""" - -from datetime import datetime -from urllib.parse import urlencode - -from markdown_it import MarkdownIt -from flask_babel import gettext - -from searx.utils import html_to_text - -about = { - "website": 'https://lemmy.ml/', - "wikidata_id": 'Q84777032', - "official_api_documentation": "https://join-lemmy.org/api/", - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} -paging = True -categories = ['social media'] - -base_url = "https://lemmy.ml/" -"""By default, https://lemmy.ml is used for providing the results. If you want -to use a different lemmy instance, you can specify ``base_url``. -""" - -lemmy_type = "Communities" -"""Any of ``Communities``, ``Users``, ``Posts``, ``Comments``""" - - -def request(query, params): - args = { - 'q': query, - 'page': params['pageno'], - 'type_': lemmy_type, - } - - params['url'] = f"{base_url}api/v3/search?{urlencode(args)}" - return params - - -def _format_content(content): - html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content) - return html_to_text(html) - - -def _get_communities(json): - results = [] - - for result in json["communities"]: - counts = result['counts'] - metadata = ( - f"{gettext('subscribers')}: {counts.get('subscribers', 0)}" - f" | {gettext('posts')}: {counts.get('posts', 0)}" - f" | {gettext('active users')}: {counts.get('users_active_half_year', 0)}" - ) - results.append( - { - 'url': result['community']['actor_id'], - 'title': result['community']['title'], - 'content': _format_content(result['community'].get('description', '')), - 'img_src': result['community'].get('icon', result['community'].get('banner')), - 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'), - 'metadata': metadata, - } - ) - return results - - -def _get_users(json): - results = [] - - for result in json["users"]: - results.append( - { - 'url': result['person']['actor_id'], - 'title': result['person']['name'], - 'content': _format_content(result['person'].get('bio', '')), - } - ) - - return results - - -def _get_posts(json): - results = [] - - for result in json["posts"]: - user = result['creator'].get('display_name', result['creator']['name']) - - img_src = None - if result['post'].get('thumbnail_url'): - img_src = result['post']['thumbnail_url'] + '?format=webp&thumbnail=208' - - metadata = ( - f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" - f" | {gettext('user')}: {user}" - f" | {gettext('comments')}: {result['counts']['comments']}" - f" | {gettext('community')}: {result['community']['title']}" - ) - - content = result['post'].get('body', '').strip() - if content: - content = _format_content(content) - - results.append( - { - 'url': result['post']['ap_id'], - 'title': result['post']['name'], - 'content': content, - 'img_src': img_src, - 'publishedDate': datetime.strptime(result['post']['published'][:19], '%Y-%m-%dT%H:%M:%S'), - 'metadata': metadata, - } - ) - - 
return results - - -def _get_comments(json): - results = [] - - for result in json["comments"]: - user = result['creator'].get('display_name', result['creator']['name']) - - content = result['comment'].get('content', '').strip() - if content: - content = _format_content(content) - - metadata = ( - f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" - f" | {gettext('user')}: {user}" - f" | {gettext('community')}: {result['community']['title']}" - ) - - results.append( - { - 'url': result['comment']['ap_id'], - 'title': result['post']['name'], - 'content': _format_content(result['comment']['content']), - 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'), - 'metadata': metadata, - } - ) - - return results - - -def response(resp): - json = resp.json() - - if lemmy_type == "Communities": - return _get_communities(json) - - if lemmy_type == "Users": - return _get_users(json) - - if lemmy_type == "Posts": - return _get_posts(json) - - if lemmy_type == "Comments": - return _get_comments(json) - - raise ValueError(f"Unsupported lemmy type: {lemmy_type}") diff --git a/apps/searxng/searx/engines/lingva.py b/apps/searxng/searx/engines/lingva.py deleted file mode 100755 index bf51b70..0000000 --- a/apps/searxng/searx/engines/lingva.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Lingva (alternative Google Translate frontend)""" - -from json import loads - -about = { - "website": 'https://lingva.ml', - "wikidata_id": None, - "official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -engine_type = 'online_dictionary' -categories = ['general'] - -url = "https://lingva.ml" -search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}" - - -def request(_query, params): - params['url'] = search_url.format( - url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query'] - ) - return params - - -def response(resp): - results = [] - - result = loads(resp.text) - info = result["info"] - from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1]) - - if "typo" in info: - results.append({"suggestion": from_to_prefix + info["typo"]}) - - if 'definitions' in info: # pylint: disable=too-many-nested-blocks - for definition in info['definitions']: - if 'list' in definition: - for item in definition['list']: - if 'synonyms' in item: - for synonym in item['synonyms']: - results.append({"suggestion": from_to_prefix + synonym}) - - infobox = "" - - for translation in info["extraTranslations"]: - infobox += f"{translation['type']}" - - for word in translation["list"]: - infobox += f"
{word['word']}
" - - for meaning in word["meanings"]: - infobox += f"
{meaning}
" - - infobox += "
" - - results.append( - { - 'infobox': result["translation"], - 'content': infobox, - } - ) - - return results diff --git a/apps/searxng/searx/engines/loc.py b/apps/searxng/searx/engines/loc.py deleted file mode 100755 index 0b2f3a6..0000000 --- a/apps/searxng/searx/engines/loc.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - - Library of Congress : images from Prints and Photographs Online Catalog - -""" - -from json import loads -from urllib.parse import urlencode - - -about = { - "website": 'https://www.loc.gov/pictures/', - "wikidata_id": 'Q131454', - "official_api_documentation": 'https://www.loc.gov/pictures/api', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['images'] - -paging = True - -base_url = 'https://loc.gov/pictures/search/?' -search_string = "&sp={page}&{query}&fo=json" - -IMG_SRC_FIXES = { - 'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/', - 'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/', - 'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/', -} - - -def request(query, params): - - search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno']) - - params['url'] = base_url + search_path - - return params - - -def response(resp): - results = [] - - json_data = loads(resp.text) - - for result in json_data['results']: - img_src = result['image']['full'] - for url_prefix, url_replace in IMG_SRC_FIXES.items(): - if img_src.startswith(url_prefix): - img_src = img_src.replace(url_prefix, url_replace) - break - else: - img_src = result['image']['thumb'] - results.append( - { - 'url': result['links']['item'], - 'title': result['title'], - 'img_src': img_src, - 'thumbnail_src': result['image']['thumb'], - 'author': result['creator'], - 'template': 'images.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/mediathekviewweb.py b/apps/searxng/searx/engines/mediathekviewweb.py deleted file mode 100755 index 5570ebe..0000000 --- a/apps/searxng/searx/engines/mediathekviewweb.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""MediathekViewWeb (API) - -""" - -import datetime -from json import loads, dumps - -about = { - "website": 'https://mediathekviewweb.de/', - "wikidata_id": 'Q27877380', - "official_api_documentation": 'https://gist.github.com/bagbag/a2888478d27de0e989cf777f81fb33de', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', - "language": "de", -} - -categories = ['videos'] -paging = True -time_range_support = False -safesearch = False - - -def request(query, params): - - params['url'] = 'https://mediathekviewweb.de/api/query' - params['method'] = 'POST' - params['headers']['Content-type'] = 'text/plain' - params['data'] = dumps( - { - 'queries': [ - { - 'fields': [ - 'title', - 'topic', - ], - 'query': query, - }, - ], - 'sortBy': 'timestamp', - 'sortOrder': 'desc', - 'future': True, - 'offset': (params['pageno'] - 1) * 10, - 'size': 10, - } - ) - return params - - -def response(resp): - - resp = loads(resp.text) - - mwv_result = resp['result'] - mwv_result_list = mwv_result['results'] - - results = [] - - for item in mwv_result_list: - - item['hms'] = str(datetime.timedelta(seconds=item['duration'])) - - results.append( - { - 'url': item['url_video_hd'].replace("http://", "https://"), - 'title': "%(channel)s: %(title)s (%(hms)s)" % item, - 'length': item['hms'], - 
'content': "%(description)s" % item, - 'iframe_src': item['url_video_hd'].replace("http://", "https://"), - 'template': 'videos.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/mediawiki.py b/apps/searxng/searx/engines/mediawiki.py deleted file mode 100755 index 6a9ac97..0000000 --- a/apps/searxng/searx/engines/mediawiki.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by -the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have -endpoints that follow this pattern:: - - https://{base_url}/w/api.php?action=query&list=search&format=json - -.. note:: - - In its actual state, this engine is implemented to parse JSON result - (`format=json`_) from a search query (`list=search`_). If you need other - ``action`` and ``list`` types ask SearXNG developers to extend the - implementation according to your needs. - -.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page -.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query -.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch -.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json - -Configuration -============= - -Request: - -- :py:obj:`base_url` -- :py:obj:`search_type` -- :py:obj:`srenablerewrites` -- :py:obj:`srsort` -- :py:obj:`srprop` - -Implementations -=============== - -""" -from __future__ import annotations -from typing import TYPE_CHECKING - -from datetime import datetime -from urllib.parse import urlencode, quote - -from searx.utils import html_to_text -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": None, - "wikidata_id": None, - "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['general'] -paging = True -number_of_results = 5 - -search_type: str = 'nearmatch' -"""Which type of search to perform. One of the following values: ``nearmatch``, -``text`` or ``title``. - -See ``srwhat`` argument in `list=search`_ documentation. -""" - -srenablerewrites: bool = True -"""Enable internal query rewriting (Type: boolean). Some search backends can -rewrite the query into another which is thought to provide better results, for -instance by correcting spelling errors. - -See ``srenablerewrites`` argument in `list=search`_ documentation. -""" - -srsort: str = 'relevance' -"""Set the sort order of returned results. One of the following values: -``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``, -``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``, -``none``, ``random``, ``relevance``, ``user_random``. - -See ``srenablerewrites`` argument in `list=search`_ documentation. -""" - -srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet' -"""Which properties to return. - -See ``srprop`` argument in `list=search`_ documentation. -""" - -base_url: str = 'https://{language}.wikipedia.org/' -"""Base URL of the Wikimedia wiki. - -``{language}``: - ISO 639-1 language code (en, de, fr ..) of the search language. 
-""" - -timestamp_format = '%Y-%m-%dT%H:%M:%SZ' -"""The longhand version of MediaWiki time strings.""" - - -def request(query, params): - - # write search-language back to params, required in response - - if params['language'] == 'all': - params['language'] = 'en' - else: - params['language'] = params['language'].split('-')[0] - - if base_url.endswith('/'): - api_url = base_url + 'w/api.php?' - else: - api_url = base_url + '/w/api.php?' - api_url = api_url.format(language=params['language']) - - offset = (params['pageno'] - 1) * number_of_results - - args = { - 'action': 'query', - 'list': 'search', - 'format': 'json', - 'srsearch': query, - 'sroffset': offset, - 'srlimit': number_of_results, - 'srwhat': search_type, - 'srprop': srprop, - 'srsort': srsort, - } - if srenablerewrites: - args['srenablerewrites'] = '1' - - params['url'] = api_url + urlencode(args) - return params - - -# get response from search-request -def response(resp): - - results = [] - search_results = resp.json() - - # return empty array if there are no results - if not search_results.get('query', {}).get('search'): - return [] - - for result in search_results['query']['search']: - - if result.get('snippet', '').startswith('#REDIRECT'): - continue - - title = result['title'] - sectiontitle = result.get('sectiontitle') - content = html_to_text(result.get('snippet', '')) - metadata = html_to_text(result.get('categorysnippet', '')) - timestamp = result.get('timestamp') - - url = ( - base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode()) - ) - if sectiontitle: - # in case of sectiontitle create a link to the section in the wiki page - url += '#' + quote(sectiontitle.replace(' ', '_').encode()) - title += ' / ' + sectiontitle - - item = {'url': url, 'title': title, 'content': content, 'metadata': metadata} - - if timestamp: - item['publishedDate'] = datetime.strptime(timestamp, timestamp_format) - - results.append(item) - - # return results - return results diff --git a/apps/searxng/searx/engines/meilisearch.py b/apps/searxng/searx/engines/meilisearch.py deleted file mode 100755 index 0c23702..0000000 --- a/apps/searxng/searx/engines/meilisearch.py +++ /dev/null @@ -1,88 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. sidebar:: info - - - :origin:`meilisearch.py ` - - `MeiliSearch `_ - - `MeiliSearch Documentation `_ - - `Install MeiliSearch - `_ - -MeiliSearch_ is aimed at individuals and small companies. It is designed for -small-scale (less than 10 million documents) data collections. E.g. it is great -for storing web pages you have visited and searching in the contents later. - -The engine supports faceted search, so you can search in a subset of documents -of the collection. Furthermore, you can search in MeiliSearch_ instances that -require authentication by setting ``auth_token``. - -Example -======= - -Here is a simple example to query a Meilisearch instance: - -.. 
code:: yaml - - - name: meilisearch - engine: meilisearch - shortcut: mes - base_url: http://localhost:7700 - index: my-index - enable_http: true - -""" - -# pylint: disable=global-statement - -from json import loads, dumps - - -base_url = 'http://localhost:7700' -index = '' -auth_key = '' -facet_filters = [] -_search_url = '' -result_template = 'key-value.html' -categories = ['general'] -paging = True - - -def init(_): - if index == '': - raise ValueError('index cannot be empty') - - global _search_url - _search_url = base_url + '/indexes/' + index + '/search' - - -def request(query, params): - if auth_key != '': - params['headers']['X-Meili-API-Key'] = auth_key - - params['headers']['Content-Type'] = 'application/json' - params['url'] = _search_url - params['method'] = 'POST' - - data = { - 'q': query, - 'offset': 10 * (params['pageno'] - 1), - 'limit': 10, - } - if len(facet_filters) > 0: - data['facetFilters'] = facet_filters - - params['data'] = dumps(data) - - return params - - -def response(resp): - results = [] - - resp_json = loads(resp.text) - for result in resp_json['hits']: - r = {key: str(value) for key, value in result.items()} - r['template'] = result_template - results.append(r) - - return results diff --git a/apps/searxng/searx/engines/metacpan.py b/apps/searxng/searx/engines/metacpan.py deleted file mode 100755 index 9d7f539..0000000 --- a/apps/searxng/searx/engines/metacpan.py +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""metacpan -""" - -from urllib.parse import urlunparse -from json import dumps - -# about -about = { - "website": 'https://metacpan.org/', - "wikidata_id": 'Q841507', - "official_api_documentation": 'https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -number_of_results = 20 # Don't put this over 5000 -categories = ["it", "packages"] -disabled = True -shortcut = "cpan" -paging = True - -query_data_template = { - 'query': { - 'multi_match': { - 'type': 'most_fields', - 'fields': ['documentation', 'documentation.*'], - 'analyzer': 'camelcase', - } - }, - 'filter': { - 'bool': { - 'must': [ - {'exists': {'field': 'documentation'}}, - {'term': {'status': 'latest'}}, - {'term': {'indexed': 1}}, - {'term': {'authorized': 1}}, - ] - } - }, - "sort": [ - {"_score": {"order": "desc"}}, - {"date": {"order": "desc"}}, - ], - '_source': ['documentation', "abstract"], - 'size': number_of_results, -} -search_url = urlunparse(["https", "fastapi.metacpan.org", "/v1/file/_search", "", "", ""]) - - -def request(query, params): - params["url"] = search_url - params["method"] = "POST" - query_data = query_data_template - query_data["query"]["multi_match"]["query"] = query - query_data["from"] = (params["pageno"] - 1) * number_of_results - params["data"] = dumps(query_data) - return params - - -def response(resp): - results = [] - - search_results = resp.json()["hits"]["hits"] - for result in search_results: - fields = result["_source"] - module = fields["documentation"] - results.append( - { - "url": "https://metacpan.org/pod/" + module, - "title": module, - "content": fields.get("abstract", ""), - } - ) - - return results diff --git a/apps/searxng/searx/engines/mixcloud.py b/apps/searxng/searx/engines/mixcloud.py deleted file mode 100755 index 3f25569..0000000 --- a/apps/searxng/searx/engines/mixcloud.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint 
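# Editor's note -- an illustrative sketch, not part of the original engine
# module: the Mixcloud engine below issues a plain GET request against the
# public Mixcloud API.  A minimal standalone equivalent of the URL it builds;
# the search term is a made-up example, endpoint and parameters mirror the
# deleted code.
from urllib.parse import urlencode

_example_offset = 0  # (pageno - 1) * 10 in the engine
_example_url = (
    'https://api.mixcloud.com/search/?'
    + urlencode({'q': 'deep house mix'})
    + '&type=cloudcast&limit=10&offset=%d' % _example_offset
)
# Each entry in the JSON response's "data" list carries 'url', 'name',
# 'created_time', 'pictures' and 'user', which response() maps to a result dict.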
-"""Mixcloud (Music) - -""" - -from urllib.parse import urlencode -from dateutil import parser - -# about -about = { - "website": 'https://www.mixcloud.com/', - "wikidata_id": 'Q6883832', - "official_api_documentation": 'http://www.mixcloud.com/developers/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['music'] -paging = True - -# search-url -url = 'https://api.mixcloud.com/' -search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}' -iframe_src = "https://www.mixcloud.com/widget/iframe/?feed={url}" - - -def request(query, params): - offset = (params['pageno'] - 1) * 10 - params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) - return params - - -def response(resp): - results = [] - search_res = resp.json() - - for result in search_res.get('data', []): - - r_url = result['url'] - publishedDate = parser.parse(result['created_time']) - res = { - 'url': r_url, - 'title': result['name'], - 'iframe_src': iframe_src.format(url=r_url), - 'img_src': result['pictures']['medium'], - 'publishedDate': publishedDate, - 'content': result['user']['name'], - } - results.append(res) - - return results diff --git a/apps/searxng/searx/engines/mongodb.py b/apps/searxng/searx/engines/mongodb.py deleted file mode 100755 index 260d6da..0000000 --- a/apps/searxng/searx/engines/mongodb.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""MongoDB_ is a document based database program that handles JSON like data. -Before configuring the ``mongodb`` engine, you must install the dependency -pymongo_. - -Configuration -============= - -In order to query MongoDB_, you have to select a ``database`` and a -``collection``. Furthermore, you have to select a ``key`` that is going to be -searched. MongoDB_ also supports the option ``exact_match_only``, so configure -it as you wish. - -Example -======= - -Below is an example configuration for using a MongoDB collection: - -.. 
code:: yaml - - # MongoDB engine - # Required dependency: pymongo - - - name: mymongo - engine: mongodb - shortcut: md - exact_match_only: false - host: '127.0.0.1' - port: 27017 - enable_http: true - results_per_page: 20 - database: 'business' - collection: 'reviews' # name of the db collection - key: 'name' # key in the collection to search for - -Implementations -=============== - -""" - -import re - -try: - from pymongo import MongoClient # type: ignore -except ImportError: - # import error is ignored because the admin has to install pymongo manually - # to use the engine - pass - - -engine_type = 'offline' - -# mongodb connection variables -host = '127.0.0.1' -port = 27017 -username = '' -password = '' -database = None -collection = None -key = None - -# engine specific variables -paging = True -results_per_page = 20 -exact_match_only = False -result_template = 'key-value.html' - -_client = None - - -def init(_): - connect() - - -def connect(): - global _client # pylint: disable=global-statement - kwargs = {'port': port} - if username: - kwargs['username'] = username - if password: - kwargs['password'] = password - _client = MongoClient(host, **kwargs)[database][collection] - - -def search(query, params): - results = [] - if exact_match_only: - q = {'$eq': query} - else: - _re = re.compile('.*{0}.*'.format(re.escape(query)), re.I | re.M) - q = {'$regex': _re} - - query = _client.find({key: q}).skip((params['pageno'] - 1) * results_per_page).limit(results_per_page) - - results.append({'number_of_results': query.count()}) - for r in query: - del r['_id'] - r = {str(k): str(v) for k, v in r.items()} - r['template'] = result_template - results.append(r) - - return results diff --git a/apps/searxng/searx/engines/mysql_server.py b/apps/searxng/searx/engines/mysql_server.py deleted file mode 100755 index 82bb37f..0000000 --- a/apps/searxng/searx/engines/mysql_server.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""MySQL is said to be the most popular open source database. Before enabling -MySQL engine, you must install the package ``mysql-connector-python``. - -The authentication plugin is configurable by setting ``auth_plugin`` in the -attributes. By default it is set to ``caching_sha2_password``. - -Example -======= - -This is an example configuration for querying a MySQL server: - -.. 
code:: yaml - - - name: my_database - engine: mysql_server - database: my_database - username: searxng - password: password - limit: 5 - query_str: 'SELECT * from my_table WHERE my_column=%(query)s' - -Implementations -=============== - -""" - -try: - import mysql.connector # type: ignore -except ImportError: - # import error is ignored because the admin has to install mysql manually to use - # the engine - pass - -engine_type = 'offline' -auth_plugin = 'caching_sha2_password' -host = "127.0.0.1" -port = 3306 -database = "" -username = "" -password = "" -query_str = "" -limit = 10 -paging = True -result_template = 'key-value.html' -_connection = None - - -def init(engine_settings): - global _connection # pylint: disable=global-statement - - if 'query_str' not in engine_settings: - raise ValueError('query_str cannot be empty') - - if not engine_settings['query_str'].lower().startswith('select '): - raise ValueError('only SELECT query is supported') - - _connection = mysql.connector.connect( - database=database, - user=username, - password=password, - host=host, - port=port, - auth_plugin=auth_plugin, - ) - - -def search(query, params): - query_params = {'query': query} - query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit) - - with _connection.cursor() as cur: - cur.execute(query_to_run, query_params) - - return _fetch_results(cur) - - -def _fetch_results(cur): - results = [] - for res in cur: - result = dict(zip(cur.column_names, map(str, res))) - result['template'] = result_template - results.append(result) - - return results diff --git a/apps/searxng/searx/engines/nyaa.py b/apps/searxng/searx/engines/nyaa.py deleted file mode 100755 index bdd3ea6..0000000 --- a/apps/searxng/searx/engines/nyaa.py +++ /dev/null @@ -1,115 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Nyaa.si (Anime Bittorrent tracker) -""" - -from lxml import html -from urllib.parse import urlencode -from searx.utils import extract_text, get_torrent_size, int_or_zero - -# about -about = { - "website": 'https://nyaa.si/', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files'] -paging = True - -# search-url -base_url = 'https://nyaa.si/' -search_url = base_url + '?page=search&{query}&offset={offset}' - -# xpath queries -xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]' -xpath_category = './/td[1]/a[1]' -xpath_title = './/td[2]/a[last()]' -xpath_torrent_links = './/td[3]/a' -xpath_filesize = './/td[4]/text()' -xpath_seeds = './/td[6]/text()' -xpath_leeches = './/td[7]/text()' -xpath_downloads = './/td[8]/text()' - - -# do search-request -def request(query, params): - query = urlencode({'term': query}) - params['url'] = search_url.format(query=query, offset=params['pageno']) - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - for result in dom.xpath(xpath_results): - # defaults - filesize = 0 - magnet_link = "" - torrent_link = "" - - # category in which our torrent belongs - try: - category = result.xpath(xpath_category)[0].attrib.get('title') - except: - pass - - # torrent title - page_a = result.xpath(xpath_title)[0] - title = extract_text(page_a) - - # link to the page - href = base_url + page_a.attrib.get('href') - - for link in result.xpath(xpath_torrent_links): - url = link.attrib.get('href') - if 'magnet' in url: - # link 
to the magnet - magnet_link = url - else: - # link to the torrent file - torrent_link = url - - # seed count - seed = int_or_zero(result.xpath(xpath_seeds)) - - # leech count - leech = int_or_zero(result.xpath(xpath_leeches)) - - # torrent downloads count - downloads = int_or_zero(result.xpath(xpath_downloads)) - - # let's try to calculate the torrent size - try: - filesize_info = result.xpath(xpath_filesize)[0] - filesize, filesize_multiplier = filesize_info.split() - filesize = get_torrent_size(filesize, filesize_multiplier) - except: - pass - - # content string contains all information not included into template - content = 'Category: "{category}". Downloaded {downloads} times.' - content = content.format(category=category, downloads=downloads) - - results.append( - { - 'url': href, - 'title': title, - 'content': content, - 'seed': seed, - 'leech': leech, - 'filesize': filesize, - 'torrentfile': torrent_link, - 'magnetlink': magnet_link, - 'template': 'torrent.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/opensemantic.py b/apps/searxng/searx/engines/opensemantic.py deleted file mode 100755 index 64bc321..0000000 --- a/apps/searxng/searx/engines/opensemantic.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Open Semantic Search -""" - -from dateutil import parser -from json import loads -from urllib.parse import quote - -# about -about = { - "website": 'https://www.opensemanticsearch.org/', - "wikidata_id": None, - "official_api_documentation": 'https://www.opensemanticsearch.org/dev', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -base_url = 'http://localhost:8983/solr/opensemanticsearch/' -search_string = 'query?q={query}' - - -def request(query, params): - search_path = search_string.format( - query=quote(query), - ) - params['url'] = base_url + search_path - return params - - -def response(resp): - results = [] - data = loads(resp.text) - docs = data.get('response', {}).get('docs', []) - - for current in docs: - item = {} - item['url'] = current['id'] - item['title'] = current['title_txt_txt_en'] - if current.get('content_txt'): - item['content'] = current['content_txt'][0] - item['publishedDate'] = parser.parse(current['file_modified_dt']) - results.append(item) - - return results diff --git a/apps/searxng/searx/engines/openstreetmap.py b/apps/searxng/searx/engines/openstreetmap.py deleted file mode 100755 index 4f799fc..0000000 --- a/apps/searxng/searx/engines/openstreetmap.py +++ /dev/null @@ -1,451 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""OpenStreetMap (Map) - -""" - -import re -from json import loads -from urllib.parse import urlencode -from functools import partial - -from flask_babel import gettext - -from searx.data import OSM_KEYS_TAGS, CURRENCIES -from searx.utils import searx_useragent -from searx.external_urls import get_external_url -from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail - -# about -about = { - "website": 'https://www.openstreetmap.org/', - "wikidata_id": 'Q936', - "official_api_documentation": 'http://wiki.openstreetmap.org/wiki/Nominatim', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['map'] -paging = False -language_support = True -send_accept_language_header = True - -# search-url -base_url = 'https://nominatim.openstreetmap.org/' -search_string = 
'search?{query}&polygon_geojson=1&format=jsonv2&addressdetails=1&extratags=1&dedupe=1' -result_id_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' -result_lat_lon_url = 'https://www.openstreetmap.org/?mlat={lat}&mlon={lon}&zoom={zoom}&layers=M' - -route_url = 'https://graphhopper.com/maps/?point={}&point={}&locale=en-US&vehicle=car&weighting=fastest&turn_costs=true&use_miles=false&layer=Omniscale' # pylint: disable=line-too-long -route_re = re.compile('(?:from )?(.+) to (.+)') - -wikidata_image_sparql = """ -select ?item ?itemLabel ?image ?sign ?symbol ?website ?wikipediaName -where { - hint:Query hint:optimizer "None". - values ?item { %WIKIDATA_IDS% } - OPTIONAL { ?item wdt:P18|wdt:P8517|wdt:P4291|wdt:P5252|wdt:P3451|wdt:P4640|wdt:P5775|wdt:P2716|wdt:P1801|wdt:P4896 ?image } - OPTIONAL { ?item wdt:P1766|wdt:P8505|wdt:P8667 ?sign } - OPTIONAL { ?item wdt:P41|wdt:P94|wdt:P154|wdt:P158|wdt:P2910|wdt:P4004|wdt:P5962|wdt:P8972 ?symbol } - OPTIONAL { ?item wdt:P856 ?website } - SERVICE wikibase:label { - bd:serviceParam wikibase:language "%LANGUAGE%,en". - ?item rdfs:label ?itemLabel . - } - OPTIONAL { - ?wikipediaUrl schema:about ?item; - schema:isPartOf/wikibase:wikiGroup "wikipedia"; - schema:name ?wikipediaName; - schema:inLanguage "%LANGUAGE%" . - } -} -ORDER by ?item -""" - - -# key value that are link: mapping functions -# 'mapillary': P1947 -# but https://github.com/kartaview/openstreetcam.org/issues/60 -# but https://taginfo.openstreetmap.org/keys/kartaview ... -def value_to_https_link(value): - http = 'http://' - if value.startswith(http): - value = 'https://' + value[len(http) :] - return (value, value) - - -def value_to_website_link(value): - value = value.split(';')[0] - return (value, value) - - -def value_wikipedia_link(value): - value = value.split(':', 1) - return ('https://{0}.wikipedia.org/wiki/{1}'.format(*value), '{1} ({0})'.format(*value)) - - -def value_with_prefix(prefix, value): - return (prefix + value, value) - - -VALUE_TO_LINK = { - 'website': value_to_website_link, - 'contact:website': value_to_website_link, - 'email': partial(value_with_prefix, 'mailto:'), - 'contact:email': partial(value_with_prefix, 'mailto:'), - 'contact:phone': partial(value_with_prefix, 'tel:'), - 'phone': partial(value_with_prefix, 'tel:'), - 'fax': partial(value_with_prefix, 'fax:'), - 'contact:fax': partial(value_with_prefix, 'fax:'), - 'contact:mastodon': value_to_https_link, - 'facebook': value_to_https_link, - 'contact:facebook': value_to_https_link, - 'contact:foursquare': value_to_https_link, - 'contact:instagram': value_to_https_link, - 'contact:linkedin': value_to_https_link, - 'contact:pinterest': value_to_https_link, - 'contact:telegram': value_to_https_link, - 'contact:tripadvisor': value_to_https_link, - 'contact:twitter': value_to_https_link, - 'contact:yelp': value_to_https_link, - 'contact:youtube': value_to_https_link, - 'contact:webcam': value_to_website_link, - 'wikipedia': value_wikipedia_link, - 'wikidata': partial(value_with_prefix, 'https://wikidata.org/wiki/'), - 'brand:wikidata': partial(value_with_prefix, 'https://wikidata.org/wiki/'), -} -KEY_ORDER = [ - 'cuisine', - 'organic', - 'delivery', - 'delivery:covid19', - 'opening_hours', - 'opening_hours:covid19', - 'fee', - 'payment:*', - 'currency:*', - 'outdoor_seating', - 'bench', - 'wheelchair', - 'level', - 'building:levels', - 'bin', - 'public_transport', - 'internet_access:ssid', -] -KEY_RANKS = {k: i for i, k in enumerate(KEY_ORDER)} - - -def request(query, params): - """do search-request""" - params['url'] 
= base_url + search_string.format(query=urlencode({'q': query})) - params['route'] = route_re.match(query) - params['headers']['User-Agent'] = searx_useragent() - if 'Accept-Language' not in params['headers']: - params['headers']['Accept-Language'] = 'en' - return params - - -def response(resp): - """get response from search-request""" - results = [] - nominatim_json = loads(resp.text) - user_language = resp.search_params['language'] - - if resp.search_params['route']: - results.append( - { - 'answer': gettext('Get directions'), - 'url': route_url.format(*resp.search_params['route'].groups()), - } - ) - - fetch_wikidata(nominatim_json, user_language) - - for result in nominatim_json: - title, address = get_title_address(result) - - # ignore result without title - if not title: - continue - - url, osm, geojson = get_url_osm_geojson(result) - img_src = get_thumbnail(get_img_src(result)) - links, link_keys = get_links(result, user_language) - data = get_data(result, user_language, link_keys) - - results.append( - { - 'template': 'map.html', - 'title': title, - 'address': address, - 'address_label': get_key_label('addr', user_language), - 'url': url, - 'osm': osm, - 'geojson': geojson, - 'img_src': img_src, - 'links': links, - 'data': data, - 'type': get_tag_label(result.get('category'), result.get('type', ''), user_language), - 'type_icon': result.get('icon'), - 'content': '', - 'longitude': result['lon'], - 'latitude': result['lat'], - 'boundingbox': result['boundingbox'], - } - ) - - return results - - -def get_wikipedia_image(raw_value): - if not raw_value: - return None - return get_external_url('wikimedia_image', raw_value) - - -def fetch_wikidata(nominatim_json, user_language): - """Update nominatim_json using the result of an unique to wikidata - - For result in nominatim_json: - If result['extratags']['wikidata'] or r['extratags']['wikidata link']: - Set result['wikidata'] to { 'image': ..., 'image_sign':..., 'image_symbal':... 
} - Set result['extratags']['wikipedia'] if not defined - Set result['extratags']['contact:website'] if not defined - """ - wikidata_ids = [] - wd_to_results = {} - for result in nominatim_json: - e = result.get("extratags") - if e: - # ignore brand:wikidata - wd_id = e.get("wikidata", e.get("wikidata link")) - if wd_id and wd_id not in wikidata_ids: - wikidata_ids.append("wd:" + wd_id) - wd_to_results.setdefault(wd_id, []).append(result) - - if wikidata_ids: - user_language = 'en' if user_language == 'all' else user_language.split('-')[0] - wikidata_ids_str = " ".join(wikidata_ids) - query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace( - '%LANGUAGE%', sparql_string_escape(user_language) - ) - wikidata_json = send_wikidata_query(query) - for wd_result in wikidata_json.get('results', {}).get('bindings', {}): - wd_id = wd_result['item']['value'].replace('http://www.wikidata.org/entity/', '') - for result in wd_to_results.get(wd_id, []): - result['wikidata'] = { - 'itemLabel': wd_result['itemLabel']['value'], - 'image': get_wikipedia_image(wd_result.get('image', {}).get('value')), - 'image_sign': get_wikipedia_image(wd_result.get('sign', {}).get('value')), - 'image_symbol': get_wikipedia_image(wd_result.get('symbol', {}).get('value')), - } - # overwrite wikipedia link - wikipedia_name = wd_result.get('wikipediaName', {}).get('value') - if wikipedia_name: - result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name - # get website if not already defined - website = wd_result.get('website', {}).get('value') - if ( - website - and not result['extratags'].get('contact:website') - and not result['extratags'].get('website') - ): - result['extratags']['contact:website'] = website - - -def get_title_address(result): - """Return title and address - - title may be None - """ - address_raw = result.get('address') - address_name = None - address = {} - - # get name - if ( - result['category'] == 'amenity' - or result['category'] == 'shop' - or result['category'] == 'tourism' - or result['category'] == 'leisure' - ): - if address_raw.get('address29'): - # https://github.com/osm-search/Nominatim/issues/1662 - address_name = address_raw.get('address29') - else: - address_name = address_raw.get(result['category']) - elif result['type'] in address_raw: - address_name = address_raw.get(result['type']) - - # add rest of adressdata, if something is already found - if address_name: - title = address_name - address.update( - { - 'name': address_name, - 'house_number': address_raw.get('house_number'), - 'road': address_raw.get('road'), - 'locality': address_raw.get( - 'city', address_raw.get('town', address_raw.get('village')) # noqa - ), # noqa - 'postcode': address_raw.get('postcode'), - 'country': address_raw.get('country'), - 'country_code': address_raw.get('country_code'), - } - ) - else: - title = result.get('display_name') - - return title, address - - -def get_url_osm_geojson(result): - """Get url, osm and geojson""" - osm_type = result.get('osm_type', result.get('type')) - if 'osm_id' not in result: - # see https://github.com/osm-search/Nominatim/issues/1521 - # query example: "EC1M 5RF London" - url = result_lat_lon_url.format(lat=result['lat'], lon=result['lon'], zoom=12) - osm = {} - else: - url = result_id_url.format(osm_type=osm_type, osm_id=result['osm_id']) - osm = {'type': osm_type, 'id': result['osm_id']} - - geojson = result.get('geojson') - # if no geojson is found and osm_type is a node, add geojson Point - if not geojson and 
osm_type == 'node': - geojson = {'type': 'Point', 'coordinates': [result['lon'], result['lat']]} - - return url, osm, geojson - - -def get_img_src(result): - """Get image URL from either wikidata or r['extratags']""" - # wikidata - img_src = None - if 'wikidata' in result: - img_src = result['wikidata']['image'] - if not img_src: - img_src = result['wikidata']['image_symbol'] - if not img_src: - img_src = result['wikidata']['image_sign'] - - # img_src - if not img_src and result.get('extratags', {}).get('image'): - img_src = result['extratags']['image'] - del result['extratags']['image'] - if not img_src and result.get('extratags', {}).get('wikimedia_commons'): - img_src = get_external_url('wikimedia_image', result['extratags']['wikimedia_commons']) - del result['extratags']['wikimedia_commons'] - - return img_src - - -def get_links(result, user_language): - """Return links from result['extratags']""" - links = [] - link_keys = set() - for k, mapping_function in VALUE_TO_LINK.items(): - raw_value = result['extratags'].get(k) - if raw_value: - url, url_label = mapping_function(raw_value) - if url.startswith('https://wikidata.org'): - url_label = result.get('wikidata', {}).get('itemLabel') or url_label - links.append( - { - 'label': get_key_label(k, user_language), - 'url': url, - 'url_label': url_label, - } - ) - link_keys.add(k) - return links, link_keys - - -def get_data(result, user_language, ignore_keys): - """Return key, value of result['extratags'] - - Must be call after get_links - - Note: the values are not translated - """ - data = [] - for k, v in result['extratags'].items(): - if k in ignore_keys: - continue - if get_key_rank(k) is None: - continue - k_label = get_key_label(k, user_language) - if k_label: - data.append( - { - 'label': k_label, - 'key': k, - 'value': v, - } - ) - data.sort(key=lambda entry: (get_key_rank(entry['key']), entry['label'])) - return data - - -def get_key_rank(k): - """Get OSM key rank - - The rank defines in which order the key are displayed in the HTML result - """ - key_rank = KEY_RANKS.get(k) - if key_rank is None: - # "payment:*" in KEY_ORDER matches "payment:cash", "payment:debit card", etc... 
- key_rank = KEY_RANKS.get(k.split(':')[0] + ':*') - return key_rank - - -def get_label(labels, lang): - """Get label from labels in OSM_KEYS_TAGS - - in OSM_KEYS_TAGS, labels have key == '*' - """ - tag_label = labels.get(lang.lower()) - if tag_label is None: - # example: if 'zh-hk' is not found, check 'zh' - tag_label = labels.get(lang.split('-')[0]) - if tag_label is None and lang != 'en': - # example: if 'zh' is not found, check 'en' - tag_label = labels.get('en') - if tag_label is None and len(labels.values()) > 0: - # example: if still not found, use the first entry - tag_label = labels.values()[0] - return tag_label - - -def get_tag_label(tag_category, tag_name, lang): - """Get tag label from OSM_KEYS_TAGS""" - tag_name = '' if tag_name is None else tag_name - tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {}) - return get_label(tag_labels, lang) - - -def get_key_label(key_name, lang): - """Get key label from OSM_KEYS_TAGS""" - if key_name.startswith('currency:'): - # currency:EUR --> get the name from the CURRENCIES variable - # see https://wiki.openstreetmap.org/wiki/Key%3Acurrency - # and for exampe https://taginfo.openstreetmap.org/keys/currency:EUR#values - # but there is also currency=EUR (currently not handled) - # https://taginfo.openstreetmap.org/keys/currency#values - currency = key_name.split(':') - if len(currency) > 1: - o = CURRENCIES['iso4217'].get(currency) - if o: - return get_label(o, lang).lower() - return currency - - labels = OSM_KEYS_TAGS['keys'] - for k in key_name.split(':') + ['*']: - labels = labels.get(k) - if labels is None: - return None - return get_label(labels, lang) diff --git a/apps/searxng/searx/engines/openverse.py b/apps/searxng/searx/engines/openverse.py deleted file mode 100755 index 9f4636e..0000000 --- a/apps/searxng/searx/engines/openverse.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - - Openverse (formerly known as: Creative Commons search engine) [Images] - -""" - -from json import loads -from urllib.parse import urlencode - - -about = { - "website": 'https://wordpress.org/openverse/', - "wikidata_id": None, - "official_api_documentation": 'https://api.openverse.engineering/v1/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['images'] - -paging = True -nb_per_page = 20 - -base_url = 'https://api.openverse.engineering/v1/images/' -search_string = '?page={page}&page_size={nb_per_page}&format=json&{query}' - - -def request(query, params): - - search_path = search_string.format(query=urlencode({'q': query}), nb_per_page=nb_per_page, page=params['pageno']) - - params['url'] = base_url + search_path - - return params - - -def response(resp): - results = [] - - json_data = loads(resp.text) - - for result in json_data['results']: - results.append( - { - 'url': result['foreign_landing_url'], - 'title': result['title'], - 'img_src': result['url'], - 'template': 'images.html', - } - ) - - return results diff --git a/apps/searxng/searx/engines/pdbe.py b/apps/searxng/searx/engines/pdbe.py deleted file mode 100755 index 34c8d32..0000000 --- a/apps/searxng/searx/engines/pdbe.py +++ /dev/null @@ -1,122 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - PDBe (Protein Data Bank in Europe) -""" - -from json import loads -from flask_babel import gettext - -# about -about = { - "website": 'https://www.ebi.ac.uk/pdbe', - "wikidata_id": 'Q55823905', - "official_api_documentation": 'https://www.ebi.ac.uk/pdbe/api/doc/search.html', - 
"use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['science'] - -hide_obsolete = False - -# status codes of unpublished entries -pdb_unpublished_codes = ['HPUB', 'HOLD', 'PROC', 'WAIT', 'AUTH', 'AUCO', 'REPL', 'POLC', 'REFI', 'TRSF', 'WDRN'] -# url for api query -pdbe_solr_url = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?' -# base url for results -pdbe_entry_url = 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}' -# link to preview image of structure -pdbe_preview_url = 'https://www.ebi.ac.uk/pdbe/static/entry/{pdb_id}_deposited_chain_front_image-200x200.png' - - -def request(query, params): - - params['url'] = pdbe_solr_url - params['method'] = 'POST' - params['data'] = {'q': query, 'wt': "json"} # request response in parsable format - return params - - -def construct_body(result): - # set title - title = result['title'] - - # construct content body - content = """{title} - {authors} {journal} ({volume}) {page} ({year})""" - - # replace placeholders with actual content - try: - if result['journal']: - content = content.format( - title=result['citation_title'], - authors=result['entry_author_list'][0], - journal=result['journal'], - volume=result['journal_volume'], - page=result['journal_page'], - year=result['citation_year'], - ) - else: - content = content.format( - title=result['citation_title'], - authors=result['entry_author_list'][0], - journal='', - volume='', - page='', - year=result['release_year'], - ) - img_src = pdbe_preview_url.format(pdb_id=result['pdb_id']) - except (KeyError): - content = None - img_src = None - - # construct url for preview image - try: - img_src = pdbe_preview_url.format(pdb_id=result['pdb_id']) - except (KeyError): - img_src = None - - return [title, content, img_src] - - -def response(resp): - - results = [] - json = loads(resp.text)['response']['docs'] - - # parse results - for result in json: - # catch obsolete entries and mark them accordingly - if result['status'] in pdb_unpublished_codes: - continue - if hide_obsolete: - continue - if result['status'] == 'OBS': - # expand title to add some sort of warning message - title = gettext('{title} (OBSOLETE)').format(title=result['title']) - try: - superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by']) - except: - continue - - # since we can't construct a proper body from the response, we'll make up our own - msg_superseded = gettext("This entry has been superseded by") - content = '{msg_superseded}: {url} ({pdb_id})'.format( - msg_superseded=msg_superseded, url=superseded_url, pdb_id=result['superseded_by'] - ) - - # obsoleted entries don't have preview images - img_src = None - else: - title, content, img_src = construct_body(result) - - results.append( - { - 'url': pdbe_entry_url.format(pdb_id=result['pdb_id']), - 'title': title, - 'content': content, - 'img_src': img_src, - } - ) - - return results diff --git a/apps/searxng/searx/engines/peertube.py b/apps/searxng/searx/engines/peertube.py deleted file mode 100755 index d0eba6b..0000000 --- a/apps/searxng/searx/engines/peertube.py +++ /dev/null @@ -1,186 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Peertube and :py:obj:`SepiaSearch ` do share -(more or less) the same REST API and the schema of the JSON result is identical. 
- -""" - -import re -from urllib.parse import urlencode -from datetime import datetime -from dateutil.parser import parse -from dateutil.relativedelta import relativedelta - -import babel - -from searx.network import get # see https://github.com/searxng/searxng/issues/762 -from searx.locales import language_tag -from searx.utils import html_to_text -from searx.enginelib.traits import EngineTraits - -traits: EngineTraits - -about = { - # pylint: disable=line-too-long - "website": 'https://joinpeertube.org', - "wikidata_id": 'Q50938515', - "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ["videos"] -paging = True -base_url = "https://peer.tube" -"""Base URL of the Peertube instance. A list of instances is available at: - -- https://instances.joinpeertube.org/instances -""" - -time_range_support = True -time_range_table = { - 'day': relativedelta(), - 'week': relativedelta(weeks=-1), - 'month': relativedelta(months=-1), - 'year': relativedelta(years=-1), -} - -safesearch = True -safesearch_table = {0: 'both', 1: 'false', 2: 'false'} - - -def minute_to_hm(minute): - if isinstance(minute, int): - return "%d:%02d" % (divmod(minute, 60)) - return None - - -def request(query, params): - """Assemble request for the Peertube API""" - - if not query: - return False - - # eng_region = traits.get_region(params['searxng_locale'], 'en_US') - eng_lang = traits.get_language(params['searxng_locale'], None) - - params['url'] = ( - base_url.rstrip("/") - + "/api/v1/search/videos?" - + urlencode( - { - 'search': query, - 'searchTarget': 'search-index', # Vidiversum - 'resultType': 'videos', - 'start': (params['pageno'] - 1) * 10, - 'count': 10, - # -createdAt: sort by date ascending / createdAt: date descending - 'sort': '-match', # sort by *match descending* - 'nsfw': safesearch_table[params['safesearch']], - } - ) - ) - - if eng_lang is not None: - params['url'] += '&languageOneOf[]=' + eng_lang - params['url'] += '&boostLanguages[]=' + eng_lang - - if params['time_range'] in time_range_table: - time = datetime.now().date() + time_range_table[params['time_range']] - params['url'] += '&startDate=' + time.isoformat() - - return params - - -def response(resp): - return video_response(resp) - - -def video_response(resp): - """Parse video response from SepiaSearch and Peertube instances.""" - results = [] - - json_data = resp.json() - - if 'data' not in json_data: - return [] - - for result in json_data['data']: - metadata = [ - x - for x in [ - result.get('channel', {}).get('displayName'), - result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'), - ', '.join(result.get('tags', [])), - ] - if x - ] - - results.append( - { - 'url': result['url'], - 'title': result['name'], - 'content': html_to_text(result.get('description') or ''), - 'author': result.get('account', {}).get('displayName'), - 'length': minute_to_hm(result.get('duration')), - 'template': 'videos.html', - 'publishedDate': parse(result['publishedAt']), - 'iframe_src': result.get('embedUrl'), - 'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'), - 'metadata': ' | '.join(metadata), - } - ) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages from peertube's search-index source code. - - See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_ - - .. 
_8ed5c729 - Refactor and redesign client: - https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729 - .. _videoLanguages: - https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291 - """ - - resp = get( - 'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue', - # the response from search-index repository is very slow - timeout=60, - ) - - if not resp.ok: # type: ignore - print("ERROR: response from peertube is not OK.") - return - - js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) # type: ignore - if not js_lang: - print("ERROR: can't determine languages from peertube") - return - - for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)): - eng_tag = lang.group(1) - if eng_tag == 'oc': - # Occitanis not known by babel, its closest relative is Catalan - # but 'ca' is already in the list of engine_traits.languages --> - # 'oc' will be ignored. - continue - try: - sxng_tag = language_tag(babel.Locale.parse(eng_tag)) - except babel.UnknownLocaleError: - print("ERROR: %s is unknown by babel" % eng_tag) - continue - - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.languages[sxng_tag] = eng_tag - - engine_traits.languages['zh_Hans'] = 'zh' - engine_traits.languages['zh_Hant'] = 'zh' diff --git a/apps/searxng/searx/engines/photon.py b/apps/searxng/searx/engines/photon.py deleted file mode 100755 index 2ea3936..0000000 --- a/apps/searxng/searx/engines/photon.py +++ /dev/null @@ -1,143 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Photon (Map) -""" - -from json import loads -from urllib.parse import urlencode -from searx.utils import searx_useragent - -# about -about = { - "website": 'https://photon.komoot.io', - "wikidata_id": None, - "official_api_documentation": 'https://photon.komoot.io/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['map'] -paging = False -number_of_results = 10 - -# search-url -base_url = 'https://photon.komoot.io/' -search_string = 'api/?{query}&limit={limit}' -result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' - -# list of supported languages -supported_languages = ['de', 'en', 'fr', 'it'] - - -# do search-request -def request(query, params): - params['url'] = base_url + search_string.format(query=urlencode({'q': query}), limit=number_of_results) - - if params['language'] != 'all': - language = params['language'].split('_')[0] - if language in supported_languages: - params['url'] = params['url'] + "&lang=" + language - - # using searx User-Agent - params['headers']['User-Agent'] = searx_useragent() - - return params - - -# get response from search-request -def response(resp): - results = [] - json = loads(resp.text) - - # parse results - for r in json.get('features', {}): - - properties = r.get('properties') - - if not properties: - continue - - # get title - title = properties.get('name') - - # get osm-type - if properties.get('osm_type') == 'N': - osm_type = 'node' - elif properties.get('osm_type') == 'W': - osm_type = 'way' - elif properties.get('osm_type') == 'R': - osm_type = 'relation' - else: - # continue if invalid osm-type - continue - - url = result_base_url.format(osm_type=osm_type, osm_id=properties.get('osm_id')) - - osm = {'type': osm_type, 'id': 
properties.get('osm_id')} - - geojson = r.get('geometry') - - if properties.get('extent'): - boundingbox = [ - properties.get('extent')[3], - properties.get('extent')[1], - properties.get('extent')[0], - properties.get('extent')[2], - ] - else: - # TODO: better boundingbox calculation - boundingbox = [ - geojson['coordinates'][1], - geojson['coordinates'][1], - geojson['coordinates'][0], - geojson['coordinates'][0], - ] - - # address calculation - address = {} - - # get name - if ( - properties.get('osm_key') == 'amenity' - or properties.get('osm_key') == 'shop' - or properties.get('osm_key') == 'tourism' - or properties.get('osm_key') == 'leisure' - ): - address = {'name': properties.get('name')} - - # add rest of adressdata, if something is already found - if address.get('name'): - address.update( - { - 'house_number': properties.get('housenumber'), - 'road': properties.get('street'), - 'locality': properties.get( - 'city', properties.get('town', properties.get('village')) # noqa - ), # noqa - 'postcode': properties.get('postcode'), - 'country': properties.get('country'), - } - ) - else: - address = None - - # append result - results.append( - { - 'template': 'map.html', - 'title': title, - 'content': '', - 'longitude': geojson['coordinates'][0], - 'latitude': geojson['coordinates'][1], - 'boundingbox': boundingbox, - 'geojson': geojson, - 'address': address, - 'osm': osm, - 'url': url, - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/piped.py b/apps/searxng/searx/engines/piped.py deleted file mode 100755 index 2bfb906..0000000 --- a/apps/searxng/searx/engines/piped.py +++ /dev/null @@ -1,165 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""An alternative privacy-friendly YouTube frontend which is efficient by -design. `Piped’s architecture`_ consists of 3 components: - -- :py:obj:`backend ` -- :py:obj:`frontend ` -- proxy - -.. _Piped’s architecture: https://docs.piped.video/docs/architecture/ - -Configuration -============= - -The :py:obj:`backend_url` and :py:obj:`frontend_url` has to be set in the engine -named `piped` and are used by all piped engines - -.. code:: yaml - - - name: piped - engine: piped - piped_filter: videos - ... - frontend_url: https://.. - backend_url: - - https://.. - - https://.. - - - name: piped.music - engine: piped - network: piped - shortcut: ppdm - piped_filter: music_songs - ... - -Known Quirks -============ - -The implementation to support :py:obj:`paging ` -is based on the *nextpage* method of Piped's REST API / the :py:obj:`frontend -API `. This feature is *next page driven* and plays well with the -:ref:`infinite_scroll ` setting in SearXNG but it does not really -fit into SearXNG's UI to select a page by number. - -Implementations -=============== -""" - -from __future__ import annotations - -import time -import random -from urllib.parse import urlencode -import datetime -from dateutil import parser - -# about -about = { - "website": 'https://github.com/TeamPiped/Piped/', - "wikidata_id": 'Q107565255', - "official_api_documentation": 'https://docs.piped.video/docs/api-documentation/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = [] -paging = True - -# search-url -backend_url: list | str = "https://pipedapi.kavin.rocks" -"""Piped-Backend_: The core component behind Piped. The value is an URL or a -list of URLs. In the latter case instance will be selected randomly. 
For a -complete list of offical instances see Piped-Instances (`JSON -`__) - -.. _Piped-Instances: https://github.com/TeamPiped/Piped/wiki/Instances -.. _Piped-Backend: https://github.com/TeamPiped/Piped-Backend - -""" - -frontend_url: str = "https://piped.video" -"""Piped-Frontend_: URL to use as link and for embeds. - -.. _Piped-Frontend: https://github.com/TeamPiped/Piped -""" - -piped_filter = 'all' -"""Content filter ``music_songs`` or ``videos``""" - - -def _backend_url() -> str: - from searx.engines import engines # pylint: disable=import-outside-toplevel - - url = engines['piped'].backend_url # type: ignore - if isinstance(url, list): - url = random.choice(url) - return url - - -def _frontend_url() -> str: - from searx.engines import engines # pylint: disable=import-outside-toplevel - - return engines['piped'].frontend_url # type: ignore - - -def request(query, params): - - args = { - 'q': query, - 'filter': piped_filter, - } - - path = "/search" - if params['pageno'] > 1: - # don't use nextpage when user selected to jump back to page 1 - nextpage = params['engine_data'].get('nextpage') - if nextpage: - path = "/nextpage/search" - args['nextpage'] = nextpage - - params["url"] = _backend_url() + f"{path}?" + urlencode(args) - return params - - -def response(resp): - results = [] - - json = resp.json() - - for result in json["items"]: - publishedDate = parser.parse(time.ctime(result.get("uploaded", 0) / 1000)) - - item = { - # the api url differs from the frontend, hence use piped.video as default - "url": _frontend_url() + result.get("url", ""), - "title": result.get("title", ""), - "publishedDate": publishedDate, - "iframe_src": _frontend_url() + '/embed' + result.get("url", ""), - } - - if piped_filter == 'videos': - item["template"] = "videos.html" - # if the value of shortDescription set, but is None, return empty string - item["content"] = result.get("shortDescription", "") or "" - item["thumbnail"] = result.get("thumbnail", "") - - elif piped_filter == 'music_songs': - item["template"] = "default.html" - item["img_src"] = result.get("thumbnail", "") - item["content"] = result.get("uploaderName", "") or "" - length = result.get("duration") - if length: - item["length"] = datetime.timedelta(seconds=length) - - results.append(item) - - results.append( - { - "engine_data": json["nextpage"], - "key": "nextpage", - } - ) - return results diff --git a/apps/searxng/searx/engines/piratebay.py b/apps/searxng/searx/engines/piratebay.py deleted file mode 100755 index 4b0984b..0000000 --- a/apps/searxng/searx/engines/piratebay.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Piratebay (Videos, Music, Files) -""" - -from json import loads -from datetime import datetime -from operator import itemgetter - -from urllib.parse import quote -from searx.utils import get_torrent_size - -# about -about = { - "website": 'https://thepiratebay.org', - "wikidata_id": 'Q22663', - "official_api_documentation": 'https://apibay.org/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ["files"] - -# search-url -url = "https://thepiratebay.org/" -search_url = "https://apibay.org/q.php?q={search_term}&cat={search_type}" - -# default trackers provided by thepiratebay -trackers = [ - "udp://tracker.coppersurfer.tk:6969/announce", - "udp://9.rarbg.to:2920/announce", - "udp://tracker.opentrackr.org:1337", - "udp://tracker.internetwarriors.net:1337/announce", - 
"udp://tracker.leechers-paradise.org:6969/announce", - "udp://tracker.coppersurfer.tk:6969/announce", - "udp://tracker.pirateparty.gr:6969/announce", - "udp://tracker.cyberia.is:6969/announce", -] - -# piratebay specific type-definitions -search_types = {"files": "0", "music": "100", "videos": "200"} - - -# do search-request -def request(query, params): - search_type = search_types.get(params["category"], "0") - - params["url"] = search_url.format(search_term=quote(query), search_type=search_type) - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_res = loads(resp.text) - - # return empty array if nothing is found - if search_res[0]["name"] == "No results returned": - return [] - - # parse results - for result in search_res: - link = url + "description.php?id=" + result["id"] - magnetlink = ( - "magnet:?xt=urn:btih:" + result["info_hash"] + "&dn=" + result["name"] + "&tr=" + "&tr=".join(trackers) - ) - - params = { - "url": link, - "title": result["name"], - "seed": result["seeders"], - "leech": result["leechers"], - "magnetlink": magnetlink, - "template": "torrent.html", - } - - # extract and convert creation date - try: - date = datetime.fromtimestamp(float(result["added"])) - params['publishedDate'] = date - except: - pass - - # let's try to calculate the torrent size - try: - filesize = get_torrent_size(result["size"], "B") - params['filesize'] = filesize - except: - pass - - # append result - results.append(params) - - # return results sorted by seeder - return sorted(results, key=itemgetter("seed"), reverse=True) diff --git a/apps/searxng/searx/engines/postgresql.py b/apps/searxng/searx/engines/postgresql.py deleted file mode 100755 index c027720..0000000 --- a/apps/searxng/searx/engines/postgresql.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""PostgreSQL is a powerful and robust open source database. Before configuring -the PostgreSQL engine, you must install the dependency ``psychopg2``. - -Example -======= - -Below is an example configuration: - -.. code:: yaml - - - name: my_database - engine: postgresql - database: my_database - username: searxng - password: password - query_str: 'SELECT * from my_table WHERE my_column = %(query)s' - -Implementations -=============== - -""" - -try: - import psycopg2 # type: ignore -except ImportError: - # import error is ignored because the admin has to install postgresql - # manually to use the engine. 
- pass - -engine_type = 'offline' -host = "127.0.0.1" -port = "5432" -database = "" -username = "" -password = "" -query_str = "" -limit = 10 -paging = True -result_template = 'key-value.html' -_connection = None - - -def init(engine_settings): - global _connection # pylint: disable=global-statement - - if 'query_str' not in engine_settings: - raise ValueError('query_str cannot be empty') - - if not engine_settings['query_str'].lower().startswith('select '): - raise ValueError('only SELECT query is supported') - - _connection = psycopg2.connect( - database=database, - user=username, - password=password, - host=host, - port=port, - ) - - -def search(query, params): - query_params = {'query': query} - query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit) - - with _connection: - with _connection.cursor() as cur: - cur.execute(query_to_run, query_params) - return _fetch_results(cur) - - -def _fetch_results(cur): - results = [] - titles = [] - - try: - titles = [column_desc.name for column_desc in cur.description] - - for res in cur: - result = dict(zip(titles, map(str, res))) - result['template'] = result_template - results.append(result) - - # no results to fetch - except psycopg2.ProgrammingError: - pass - - return results diff --git a/apps/searxng/searx/engines/pubmed.py b/apps/searxng/searx/engines/pubmed.py deleted file mode 100755 index 02e282d..0000000 --- a/apps/searxng/searx/engines/pubmed.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - PubMed (Scholar publications) -""" - -from lxml import etree -from datetime import datetime -from urllib.parse import urlencode -from searx.network import get -from searx.utils import ( - eval_xpath_getindex, - eval_xpath_list, - extract_text, -) - -# about -about = { - "website": 'https://www.ncbi.nlm.nih.gov/pubmed/', - "wikidata_id": 'Q1540899', - "official_api_documentation": { - 'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/', - 'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/', - }, - "use_official_api": True, - "require_api_key": False, - "results": 'XML', -} - -categories = ['science', 'scientific publications'] - -base_url = ( - 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' -) - -# engine dependent config -number_of_results = 10 -pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/' - - -def request(query, params): - # basic search - offset = (params['pageno'] - 1) * number_of_results - - string_args = dict(query=urlencode({'term': query}), offset=offset, hits=number_of_results) - - params['url'] = base_url.format(**string_args) - - return params - - -def response(resp): - results = [] - - # First retrieve notice of each result - pubmed_retrieve_api_url = ( - 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' 
+ 'db=pubmed&retmode=xml&id={pmids_string}' - ) - - pmids_results = etree.XML(resp.content) - pmids = pmids_results.xpath('//eSearchResult/IdList/Id') - pmids_string = '' - - for item in pmids: - pmids_string += item.text + ',' - - retrieve_notice_args = dict(pmids_string=pmids_string) - - retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) - - search_results_response = get(retrieve_url_encoded).content - search_results = etree.XML(search_results_response) - for entry in eval_xpath_list(search_results, '//PubmedArticle'): - medline = eval_xpath_getindex(entry, './MedlineCitation', 0) - - title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text - pmid = eval_xpath_getindex(medline, './/PMID', 0).text - url = pubmed_url + pmid - content = extract_text( - eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True - ) - doi = extract_text( - eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True - ) - journal = extract_text( - eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True - ) - issn = extract_text( - eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True - ) - authors = [] - for author in eval_xpath_list(medline, './Article/AuthorList/Author'): - f = eval_xpath_getindex(author, './ForeName', 0, default=None) - l = eval_xpath_getindex(author, './LastName', 0, default=None) - f = '' if f is None else f.text - l = '' if l is None else l.text - authors.append((f + ' ' + l).strip()) - - res_dict = { - 'template': 'paper.html', - 'url': url, - 'title': title, - 'content': content, - 'journal': journal, - 'issn': [issn], - 'authors': authors, - 'doi': doi, - } - - accepted_date = eval_xpath_getindex( - entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None - ) - if accepted_date is not None: - year = eval_xpath_getindex(accepted_date, './Year', 0) - month = eval_xpath_getindex(accepted_date, './Month', 0) - day = eval_xpath_getindex(accepted_date, './Day', 0) - try: - publishedDate = datetime.strptime( - year.text + '-' + month.text + '-' + day.text, - '%Y-%m-%d', - ) - res_dict['publishedDate'] = publishedDate - except Exception as e: - print(e) - - results.append(res_dict) - - return results diff --git a/apps/searxng/searx/engines/qwant.py b/apps/searxng/searx/engines/qwant.py deleted file mode 100755 index 4a41676..0000000 --- a/apps/searxng/searx/engines/qwant.py +++ /dev/null @@ -1,284 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Qwant (Web, News, Images, Videos) - -This engine uses the Qwant API (https://api.qwant.com/v3). The API is -undocumented but can be reverse engineered by reading the network log of -https://www.qwant.com/ queries. - -This implementation is used by different qwant engines in the settings.yml:: - - - name: qwant - qwant_categ: web - ... - - name: qwant news - qwant_categ: news - ... - - name: qwant images - qwant_categ: images - ... - - name: qwant videos - qwant_categ: videos - ... 
- -""" - -from datetime import ( - datetime, - timedelta, -) -from json import loads -from urllib.parse import urlencode -from flask_babel import gettext -import babel - -from searx.exceptions import SearxEngineAPIException -from searx.network import raise_for_httperror -from searx.enginelib.traits import EngineTraits - -traits: EngineTraits - -# about -about = { - "website": 'https://www.qwant.com/', - "wikidata_id": 'Q14657870', - "official_api_documentation": None, - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = [] -paging = True -qwant_categ = None # web|news|inages|videos - -safesearch = True -safe_search_map = {0: '&safesearch=0', 1: '&safesearch=1', 2: '&safesearch=2'} - -# fmt: off -qwant_news_locales = [ - 'ca_ad', 'ca_es', 'ca_fr', 'co_fr', 'de_at', 'de_ch', 'de_de', 'en_au', - 'en_ca', 'en_gb', 'en_ie', 'en_my', 'en_nz', 'en_us', 'es_ad', 'es_ar', - 'es_cl', 'es_co', 'es_es', 'es_mx', 'es_pe', 'eu_es', 'eu_fr', 'fc_ca', - 'fr_ad', 'fr_be', 'fr_ca', 'fr_ch', 'fr_fr', 'it_ch', 'it_it', 'nl_be', - 'nl_nl', 'pt_ad', 'pt_pt', -] -# fmt: on - -# search-url -url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}' - - -def request(query, params): - """Qwant search request""" - - if not query: - return None - - count = 10 # web: count must be equal to 10 - - if qwant_categ == 'images': - count = 50 - offset = (params['pageno'] - 1) * count - # count + offset must be lower than 250 - offset = min(offset, 199) - else: - offset = (params['pageno'] - 1) * count - # count + offset must be lower than 50 - offset = min(offset, 40) - - params['url'] = url.format( - keyword=qwant_categ, - query=urlencode({'q': query}), - offset=offset, - count=count, - ) - - # add quant's locale - q_locale = traits.get_region(params["searxng_locale"], default='en_US') - params['url'] += '&locale=' + q_locale - - # add safesearch option - params['url'] += safe_search_map.get(params['safesearch'], '') - - params['raise_for_httperror'] = False - return params - - -def response(resp): - """Get response from Qwant's search request""" - # pylint: disable=too-many-locals, too-many-branches, too-many-statements - - results = [] - - # load JSON result - search_results = loads(resp.text) - data = search_results.get('data', {}) - - # check for an API error - if search_results.get('status') != 'success': - msg = ",".join( - data.get( - 'message', - [ - 'unknown', - ], - ) - ) - raise SearxEngineAPIException('API error::' + msg) - - # raise for other errors - raise_for_httperror(resp) - - if qwant_categ == 'web': - # The WEB query contains a list named 'mainline'. This list can contain - # different result types (e.g. mainline[0]['type'] returns type of the - # result items in mainline[0]['items'] - mainline = data.get('result', {}).get('items', {}).get('mainline', {}) - else: - # Queries on News, Images and Videos do not have a list named 'mainline' - # in the response. The result items are directly in the list - # result['items']. 
- mainline = data.get('result', {}).get('items', []) - mainline = [ - {'type': qwant_categ, 'items': mainline}, - ] - - # return empty array if there are no results - if not mainline: - return [] - - for row in mainline: - - mainline_type = row.get('type', 'web') - if mainline_type != qwant_categ: - continue - - if mainline_type == 'ads': - # ignore adds - continue - - mainline_items = row.get('items', []) - for item in mainline_items: - - title = item.get('title', None) - res_url = item.get('url', None) - - if mainline_type == 'web': - content = item['desc'] - results.append( - { - 'title': title, - 'url': res_url, - 'content': content, - } - ) - - elif mainline_type == 'news': - - pub_date = item['date'] - if pub_date is not None: - pub_date = datetime.fromtimestamp(pub_date) - news_media = item.get('media', []) - img_src = None - if news_media: - img_src = news_media[0].get('pict', {}).get('url', None) - results.append( - { - 'title': title, - 'url': res_url, - 'publishedDate': pub_date, - 'img_src': img_src, - } - ) - - elif mainline_type == 'images': - thumbnail = item['thumbnail'] - img_src = item['media'] - results.append( - { - 'title': title, - 'url': res_url, - 'template': 'images.html', - 'thumbnail_src': thumbnail, - 'img_src': img_src, - } - ) - - elif mainline_type == 'videos': - # some videos do not have a description: while qwant-video - # returns an empty string, such video from a qwant-web query - # miss the 'desc' key. - d, s, c = item.get('desc'), item.get('source'), item.get('channel') - content_parts = [] - if d: - content_parts.append(d) - if s: - content_parts.append("%s: %s " % (gettext("Source"), s)) - if c: - content_parts.append("%s: %s " % (gettext("Channel"), c)) - content = ' // '.join(content_parts) - length = item['duration'] - if length is not None: - length = timedelta(milliseconds=length) - pub_date = item['date'] - if pub_date is not None: - pub_date = datetime.fromtimestamp(pub_date) - thumbnail = item['thumbnail'] - # from some locations (DE and others?) the s2 link do - # response a 'Please wait ..' 
but does not deliver the thumbnail - thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1) - results.append( - { - 'title': title, - 'url': res_url, - 'content': content, - 'publishedDate': pub_date, - 'thumbnail': thumbnail, - 'template': 'videos.html', - 'length': length, - } - ) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - - # pylint: disable=import-outside-toplevel - from searx import network - from searx.locales import region_tag - - resp = network.get(about['website']) - text = resp.text - text = text[text.find('INITIAL_PROPS') :] - text = text[text.find('{') : text.find('')] - - q_initial_props = loads(text) - q_locales = q_initial_props.get('locales') - eng_tag_list = set() - - for country, v in q_locales.items(): - for lang in v['langs']: - _locale = "{lang}_{country}".format(lang=lang, country=country) - - if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales: - # qwant-news does not support all locales from qwant-web: - continue - - eng_tag_list.add(_locale) - - for eng_tag in eng_tag_list: - try: - sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_')) - except babel.UnknownLocaleError: - print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag) - continue - - conflict = engine_traits.regions.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.regions[sxng_tag] = eng_tag diff --git a/apps/searxng/searx/engines/recoll.py b/apps/searxng/searx/engines/recoll.py deleted file mode 100755 index c11e197..0000000 --- a/apps/searxng/searx/engines/recoll.py +++ /dev/null @@ -1,144 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. sidebar:: info - - - `Recoll `_ - - `recoll-webui `_ - - :origin:`searx/engines/recoll.py` - -Recoll_ is a desktop full-text search tool based on Xapian. By itself Recoll_ -does not offer WEB or API access, this can be achieved using recoll-webui_ - -Configuration -============= - -You must configure the following settings: - -``base_url``: - Location where recoll-webui can be reached. - -``mount_prefix``: - Location where the file hierarchy is mounted on your *local* filesystem. - -``dl_prefix``: - Location where the file hierarchy as indexed by recoll can be reached. - -``search_dir``: - Part of the indexed file hierarchy to be search, if empty the full domain is - searched. - -Example -======= - -Scenario: - -#. Recoll indexes a local filesystem mounted in ``/export/documents/reference``, -#. the Recoll search interface can be reached at https://recoll.example.org/ and -#. the contents of this filesystem can be reached though https://download.example.org/reference - -.. 
code:: yaml - - base_url: https://recoll.example.org/ - mount_prefix: /export/documents - dl_prefix: https://download.example.org - search_dir: '' - -Implementations -=============== - -""" - -from datetime import date, timedelta -from json import loads -from urllib.parse import urlencode, quote - -# about -about = { - "website": None, - "wikidata_id": 'Q15735774', - "official_api_documentation": 'https://www.lesbonscomptes.com/recoll/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -paging = True -time_range_support = True - -# parameters from settings.yml -base_url = None -search_dir = '' -mount_prefix = None -dl_prefix = None - -# embedded -embedded_url = '<{ttype} controls height="166px" ' + 'src="{url}" type="{mtype}">' - - -# helper functions -def get_time_range(time_range): - sw = {'day': 1, 'week': 7, 'month': 30, 'year': 365} # pylint: disable=invalid-name - - offset = sw.get(time_range, 0) - if not offset: - return '' - - return (date.today() - timedelta(days=offset)).isoformat() - - -# do search-request -def request(query, params): - search_after = get_time_range(params['time_range']) - search_url = base_url + 'json?{query}&highlight=0' - params['url'] = search_url.format( - query=urlencode({'query': query, 'page': params['pageno'], 'after': search_after, 'dir': search_dir}) - ) - - return params - - -# get response from search-request -def response(resp): - results = [] - - response_json = loads(resp.text) - - if not response_json: - return [] - - for result in response_json.get('results', []): - title = result['label'] - url = result['url'].replace('file://' + mount_prefix, dl_prefix) - content = '{}'.format(result['snippet']) - - # append result - item = {'url': url, 'title': title, 'content': content, 'template': 'files.html'} - - if result['size']: - item['size'] = int(result['size']) - - for parameter in ['filename', 'abstract', 'author', 'mtype', 'time']: - if result[parameter]: - item[parameter] = result[parameter] - - # facilitate preview support for known mime types - if 'mtype' in result and '/' in result['mtype']: - (mtype, subtype) = result['mtype'].split('/') - item['mtype'] = mtype - item['subtype'] = subtype - - if mtype in ['audio', 'video']: - item['embedded'] = embedded_url.format( - ttype=mtype, url=quote(url.encode('utf8'), '/:'), mtype=result['mtype'] - ) - - if mtype in ['image'] and subtype in ['bmp', 'gif', 'jpeg', 'png']: - item['img_src'] = url - - results.append(item) - - if 'nres' in response_json: - results.append({'number_of_results': response_json['nres']}) - - return results diff --git a/apps/searxng/searx/engines/reddit.py b/apps/searxng/searx/engines/reddit.py deleted file mode 100755 index 36d9233..0000000 --- a/apps/searxng/searx/engines/reddit.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Reddit -""" - -import json -from datetime import datetime -from urllib.parse import urlencode, urljoin, urlparse - -# about -about = { - "website": 'https://www.reddit.com/', - "wikidata_id": 'Q1136', - "official_api_documentation": 'https://www.reddit.com/dev/api', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['social media'] -page_size = 25 - -# search-url -base_url = 'https://www.reddit.com/' -search_url = base_url + 'search.json?{query}' - - -def request(query, params): - - query = urlencode({'q': query, 'limit': page_size}) - params['url'] = 
search_url.format(query=query) - - return params - - -def response(resp): - - img_results = [] - text_results = [] - - search_results = json.loads(resp.text) - - # return empty array if there are no results - if 'data' not in search_results: - return [] - - posts = search_results.get('data', {}).get('children', []) - - # process results - for post in posts: - data = post['data'] - - # extract post information - params = {'url': urljoin(base_url, data['permalink']), 'title': data['title']} - - # if thumbnail field contains a valid URL, we need to change template - thumbnail = data['thumbnail'] - url_info = urlparse(thumbnail) - # netloc & path - if url_info[1] != '' and url_info[2] != '': - params['img_src'] = data['url'] - params['thumbnail_src'] = thumbnail - params['template'] = 'images.html' - img_results.append(params) - else: - created = datetime.fromtimestamp(data['created_utc']) - content = data['selftext'] - if len(content) > 500: - content = content[:500] + '...' - params['content'] = content - params['publishedDate'] = created - text_results.append(params) - - # show images first and text results second - return img_results + text_results diff --git a/apps/searxng/searx/engines/redis_server.py b/apps/searxng/searx/engines/redis_server.py deleted file mode 100755 index 9808125..0000000 --- a/apps/searxng/searx/engines/redis_server.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Redis is an open source (BSD licensed), in-memory data structure (key value -based) store. Before configuring the ``redis_server`` engine, you must install -the dependency redis_. - -Configuration -============= - -Select a database to search in and set its index in the option ``db``. You can -either look for exact matches or use partial keywords to find what you are -looking for by configuring ``exact_match_only``. - -Example -======= - -Below is an example configuration: - -.. 
code:: yaml - - # Required dependency: redis - - - name: myredis - shortcut : rds - engine: redis_server - exact_match_only: false - host: '127.0.0.1' - port: 6379 - enable_http: true - password: '' - db: 0 - -Implementations -=============== - -""" - -import redis # pylint: disable=import-error - -engine_type = 'offline' - -# redis connection variables -host = '127.0.0.1' -port = 6379 -password = '' -db = 0 - -# engine specific variables -paging = False -result_template = 'key-value.html' -exact_match_only = True - -_redis_client = None - - -def init(_engine_settings): - global _redis_client # pylint: disable=global-statement - _redis_client = redis.StrictRedis( - host=host, - port=port, - db=db, - password=password or None, - decode_responses=True, - ) - - -def search(query, _params): - if not exact_match_only: - return search_keys(query) - - ret = _redis_client.hgetall(query) - if ret: - ret['template'] = result_template - return [ret] - - if ' ' in query: - qset, rest = query.split(' ', 1) - ret = [] - for res in _redis_client.hscan_iter(qset, match='*{}*'.format(rest)): - ret.append( - { - res[0]: res[1], - 'template': result_template, - } - ) - return ret - return [] - - -def search_keys(query): - ret = [] - for key in _redis_client.scan_iter(match='*{}*'.format(query)): - key_type = _redis_client.type(key) - res = None - - if key_type == 'hash': - res = _redis_client.hgetall(key) - elif key_type == 'list': - res = dict(enumerate(_redis_client.lrange(key, 0, -1))) - - if res: - res['template'] = result_template - res['redis_key'] = key - ret.append(res) - return ret diff --git a/apps/searxng/searx/engines/rumble.py b/apps/searxng/searx/engines/rumble.py deleted file mode 100755 index beca257..0000000 --- a/apps/searxng/searx/engines/rumble.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Rumble (Videos) -""" -from urllib.parse import urlencode -from lxml import html -from datetime import datetime - -# about -from searx.utils import extract_text - -about = { - "website": 'https://rumble.com/', - "wikidata_id": 'Q104765127', - "official_api_documentation": 'https://help.rumble.com/', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['videos'] -paging = True - -# search-url -base_url = 'https://rumble.com' -# https://rumble.com/search/video?q=searx&page=3 -search_url = base_url + '/search/video?{query}&page={pageno}' - -url_xpath = './/a[@class="video-item--a"]/@href' -thumbnail_xpath = './/img[@class="video-item--img"]/@src' -title_xpath = './/h3[@class="video-item--title"]' -published_date = './/time[@class="video-item--meta video-item--time"]/@datetime' -earned_xpath = './/span[@class="video-item--meta video-item--earned"]/@data-value' -views_xpath = './/span[@class="video-item--meta video-item--views"]/@data-value' -rumbles_xpath = './/span[@class="video-item--meta video-item--rumbles"]/@data-value' -author_xpath = './/div[@class="ellipsis-1"]' -length_xpath = './/span[@class="video-item--duration"]/@data-value' - - -def request(query, params): - params['url'] = search_url.format(pageno=params['pageno'], query=urlencode({'q': query})) - return params - - -def response(resp): - results = [] - dom = html.fromstring(resp.text) - results_dom = dom.xpath('//li[contains(@class, "video-listing-entry")]') - - if not results_dom: - return [] - - for result_dom in results_dom: - url = base_url + extract_text(result_dom.xpath(url_xpath)) - thumbnail = 
extract_text(result_dom.xpath(thumbnail_xpath)) - title = extract_text(result_dom.xpath(title_xpath)) - p_date = extract_text(result_dom.xpath(published_date)) - # fix offset date for line 644 webapp.py check - fixed_date = datetime.strptime(p_date, '%Y-%m-%dT%H:%M:%S%z') - earned = extract_text(result_dom.xpath(earned_xpath)) - views = extract_text(result_dom.xpath(views_xpath)) - rumbles = extract_text(result_dom.xpath(rumbles_xpath)) - author = extract_text(result_dom.xpath(author_xpath)) - length = extract_text(result_dom.xpath(length_xpath)) - if earned: - content = f"{views} views - {rumbles} rumbles - ${earned}" - else: - content = f"{views} views - {rumbles} rumbles" - - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'author': author, - 'length': length, - 'template': 'videos.html', - 'publishedDate': fixed_date, - 'thumbnail': thumbnail, - } - ) - return results diff --git a/apps/searxng/searx/engines/scanr_structures.py b/apps/searxng/searx/engines/scanr_structures.py deleted file mode 100755 index ad27079..0000000 --- a/apps/searxng/searx/engines/scanr_structures.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - ScanR Structures (Science) -""" - -from json import loads, dumps -from searx.utils import html_to_text - -# about -about = { - "website": 'https://scanr.enseignementsup-recherche.gouv.fr', - "wikidata_id": 'Q44105684', - "official_api_documentation": 'https://scanr.enseignementsup-recherche.gouv.fr/opendata', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['science'] -paging = True -page_size = 20 - -# search-url -url = 'https://scanr.enseignementsup-recherche.gouv.fr/' -search_url = url + 'api/structures/search' - - -# do search-request -def request(query, params): - - params['url'] = search_url - params['method'] = 'POST' - params['headers']['Content-type'] = "application/json" - params['data'] = dumps( - { - "query": query, - "searchField": "ALL", - "sortDirection": "ASC", - "sortOrder": "RELEVANCY", - "page": params['pageno'], - "pageSize": page_size, - } - ) - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_res = loads(resp.text) - - # return empty array if there are no results - if search_res.get('total', 0) < 1: - return [] - - # parse results - for result in search_res['results']: - if 'id' not in result: - continue - - # is it thumbnail or img_src?? 
- thumbnail = None - if 'logo' in result: - thumbnail = result['logo'] - if thumbnail[0] == '/': - thumbnail = url + thumbnail - - content = None - if 'highlights' in result: - content = result['highlights'][0]['value'] - - # append result - results.append( - { - 'url': url + 'structure/' + result['id'], - 'title': result['label'], - # 'thumbnail': thumbnail, - 'img_src': thumbnail, - 'content': html_to_text(content), - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/searchcode_code.py b/apps/searxng/searx/engines/searchcode_code.py deleted file mode 100755 index a4b0308..0000000 --- a/apps/searxng/searx/engines/searchcode_code.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Searchcode (IT) -""" - -from json import loads -from urllib.parse import urlencode - -# about -about = { - "website": 'https://searchcode.com/', - "wikidata_id": None, - "official_api_documentation": 'https://searchcode.com/api/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['it'] -paging = True - -# search-url -url = 'https://searchcode.com/' -search_url = url + 'api/codesearch_I/?{query}&p={pageno}' - -# special code-endings which are not recognised by the file ending -code_endings = {'cs': 'c#', 'h': 'c', 'hpp': 'cpp', 'cxx': 'cpp'} - - -# do search-request -def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1) - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_results = loads(resp.text) - - # parse results - for result in search_results.get('results', []): - href = result['url'] - title = "" + result['name'] + " - " + result['filename'] - repo = result['repo'] - - lines = dict() - for line, code in result['lines'].items(): - lines[int(line)] = code - - code_language = code_endings.get( - result['filename'].split('.')[-1].lower(), result['filename'].split('.')[-1].lower() - ) - - # append result - results.append( - { - 'url': href, - 'title': title, - 'content': '', - 'repository': repo, - 'codelines': sorted(lines.items()), - 'code_language': code_language, - 'template': 'code.html', - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/searx_engine.py b/apps/searxng/searx/engines/searx_engine.py deleted file mode 100755 index 84a8e64..0000000 --- a/apps/searxng/searx/engines/searx_engine.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Searx (all) -""" - -from json import loads -from searx.engines import categories as searx_categories - -# about -about = { - "website": 'https://github.com/searxng/searxng', - "wikidata_id": 'Q17639196', - "official_api_documentation": 'https://docs.searxng.org/dev/search_api.html', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = searx_categories.keys() - -# search-url -instance_urls = [] -instance_index = 0 - - -# do search-request -def request(query, params): - global instance_index - params['url'] = instance_urls[instance_index % len(instance_urls)] - params['method'] = 'POST' - - instance_index += 1 - - params['data'] = { - 'q': query, - 'pageno': params['pageno'], - 'language': params['language'], - 'time_range': params['time_range'], - 'category': params['category'], - 'format': 'json', - } - - return params - - -# get response from search-request -def response(resp): - - response_json = 
loads(resp.text) - results = response_json['results'] - - for i in ('answers', 'infoboxes'): - results.extend(response_json[i]) - - results.extend({'suggestion': s} for s in response_json['suggestions']) - - results.append({'number_of_results': response_json['number_of_results']}) - - return results diff --git a/apps/searxng/searx/engines/semantic_scholar.py b/apps/searxng/searx/engines/semantic_scholar.py deleted file mode 100755 index 7a1b5b2..0000000 --- a/apps/searxng/searx/engines/semantic_scholar.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Semantic Scholar (Science) -""" - -from json import dumps, loads -from datetime import datetime - -from flask_babel import gettext - -about = { - "website": 'https://www.semanticscholar.org/', - "wikidata_id": 'Q22908627', - "official_api_documentation": 'https://api.semanticscholar.org/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['science', 'scientific publications'] -paging = True -search_url = 'https://www.semanticscholar.org/api/1/search' -paper_url = 'https://www.semanticscholar.org/paper' - - -def request(query, params): - params['url'] = search_url - params['method'] = 'POST' - params['headers']['content-type'] = 'application/json' - params['data'] = dumps( - { - "queryString": query, - "page": params['pageno'], - "pageSize": 10, - "sort": "relevance", - "useFallbackRankerService": False, - "useFallbackSearchCluster": False, - "getQuerySuggestions": False, - "authors": [], - "coAuthors": [], - "venues": [], - "performTitleMatch": True, - } - ) - return params - - -def response(resp): - res = loads(resp.text) - results = [] - for result in res['results']: - url = result.get('primaryPaperLink', {}).get('url') - if not url and result.get('links'): - url = result.get('links')[0] - if not url: - alternatePaperLinks = result.get('alternatePaperLinks') - if alternatePaperLinks: - url = alternatePaperLinks[0].get('url') - if not url: - url = paper_url + '/%s' % result['id'] - - # publishedDate - if 'pubDate' in result: - publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d") - else: - publishedDate = None - - # authors - authors = [author[0]['name'] for author in result.get('authors', [])] - - # pick for the first alternate link, but not from the crawler - pdf_url = None - for doc in result.get('alternatePaperLinks', []): - if doc['linkType'] not in ('crawler', 'doi'): - pdf_url = doc['url'] - break - - # comments - comments = None - if 'citationStats' in result: - comments = gettext( - '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}' - ).format( - numCitations=result['citationStats']['numCitations'], - firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'], - lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'], - ) - - results.append( - { - 'template': 'paper.html', - 'url': url, - 'title': result['title']['text'], - 'content': result['paperAbstract']['text'], - 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), - 'doi': result.get('doiInfo', {}).get('doi'), - 'tags': result.get('fieldsOfStudy'), - 'authors': authors, - 'pdf_url': pdf_url, - 'publishedDate': publishedDate, - 'comments': comments, - } - ) - - return results diff --git a/apps/searxng/searx/engines/sepiasearch.py b/apps/searxng/searx/engines/sepiasearch.py deleted file mode 100755 index 72157b2..0000000 --- 
a/apps/searxng/searx/engines/sepiasearch.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""SepiaSearch uses the same languages as :py:obj:`Peertube -` and the response is identical to the response from the -peertube engines. - -""" - -from typing import TYPE_CHECKING - -from urllib.parse import urlencode -from datetime import datetime - -from searx.engines.peertube import fetch_traits # pylint: disable=unused-import -from searx.engines.peertube import ( - # pylint: disable=unused-import - video_response, - safesearch_table, - time_range_table, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -about = { - # pylint: disable=line-too-long - "website": 'https://sepiasearch.org', - "wikidata_id": None, - "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['videos'] -paging = True - -base_url = 'https://sepiasearch.org' - -time_range_support = True -safesearch = True - - -def request(query, params): - """Assemble request for the SepiaSearch API""" - - if not query: - return False - - # eng_region = traits.get_region(params['searxng_locale'], 'en_US') - eng_lang = traits.get_language(params['searxng_locale'], None) - - params['url'] = ( - base_url.rstrip("/") - + "/api/v1/search/videos?" - + urlencode( - { - 'search': query, - 'start': (params['pageno'] - 1) * 10, - 'count': 10, - # -createdAt: sort by date ascending / createdAt: date descending - 'sort': '-match', # sort by *match descending* - 'nsfw': safesearch_table[params['safesearch']], - } - ) - ) - - if eng_lang is not None: - params['url'] += '&languageOneOf[]=' + eng_lang - params['url'] += '&boostLanguages[]=' + eng_lang - - if params['time_range'] in time_range_table: - time = datetime.now().date() + time_range_table[params['time_range']] - params['url'] += '&startDate=' + time.isoformat() - - return params - - -def response(resp): - return video_response(resp) diff --git a/apps/searxng/searx/engines/seznam.py b/apps/searxng/searx/engines/seznam.py deleted file mode 100755 index 36a3884..0000000 --- a/apps/searxng/searx/engines/seznam.py +++ /dev/null @@ -1,74 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Seznam - -""" - -from urllib.parse import urlencode -from lxml import html -from searx.network import get -from searx.exceptions import SearxEngineAccessDeniedException -from searx.utils import ( - extract_text, - eval_xpath_list, - eval_xpath_getindex, -) - -# about -about = { - "website": "https://www.seznam.cz/", - "wikidata_id": "Q3490485", - "official_api_documentation": "https://api.sklik.cz/", - "use_official_api": False, - "require_api_key": False, - "results": "HTML", - "language": "cz", -} - -categories = ['general', 'web'] -base_url = 'https://search.seznam.cz/' - - -def request(query, params): - response_index = get(base_url, headers=params['headers'], raise_for_httperror=True) - dom = html.fromstring(response_index.text) - - url_params = { - 'q': query, - 'oq': query, - } - for e in eval_xpath_list(dom, '//input[@type="hidden"]'): - name = e.get('name') - value = e.get('value') - url_params[name] = value - - params['url'] = base_url + '?' 
+ urlencode(url_params) - params['cookies'] = response_index.cookies - return params - - -def response(resp): - if resp.url.path.startswith('/verify'): - raise SearxEngineAccessDeniedException() - - results = [] - - dom = html.fromstring(resp.content.decode()) - for result_element in eval_xpath_list( - dom, '//div[@id="searchpage-root"]//div[@class="Layout--left"]/div[@class="f2c528"]' - ): - result_data = eval_xpath_getindex( - result_element, './/div[@class="c8774a" or @class="e69e8d a11657"]', 0, default=None - ) - if result_data is None: - continue - title_element = eval_xpath_getindex(result_element, './/h3/a', 0) - results.append( - { - 'url': title_element.get('href'), - 'title': extract_text(title_element), - 'content': extract_text(result_data), - } - ) - - return results diff --git a/apps/searxng/searx/engines/sjp.py b/apps/searxng/searx/engines/sjp.py deleted file mode 100755 index 6daa46e..0000000 --- a/apps/searxng/searx/engines/sjp.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Słownik Języka Polskiego - -Dictionary of the polish language from PWN (sjp.pwn) -""" - -from lxml.html import fromstring -from searx import logger -from searx.utils import extract_text -from searx.network import raise_for_httperror - -logger = logger.getChild('sjp engine') - -# about -about = { - "website": 'https://sjp.pwn.pl', - "wikidata_id": 'Q55117369', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', - "language": 'pl', -} - -categories = ['dictionaries'] -paging = False - -URL = 'https://sjp.pwn.pl' -SEARCH_URL = URL + '/szukaj/{query}.html' - -word_xpath = '//div[@class="query"]' -dict_xpath = [ - '//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]', - '//div[@class="wyniki sjp-wyniki sjp-anchor"]', - '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]', -] - - -def request(query, params): - params['url'] = SEARCH_URL.format(query=query) - logger.debug(f"query_url --> {params['url']}") - return params - - -def response(resp): - results = [] - - raise_for_httperror(resp) - dom = fromstring(resp.text) - word = extract_text(dom.xpath(word_xpath)) - - definitions = [] - - for dict_src in dict_xpath: - for src in dom.xpath(dict_src): - src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip() - - src_defs = [] - for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'): - if def_item.xpath('./div[@class="znacz"]'): - sub_defs = [] - for def_sub_item in def_item.xpath('./div[@class="znacz"]'): - def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ') - sub_defs.append(def_sub_text) - src_defs.append((word, sub_defs)) - else: - def_text = extract_text(def_item).strip() - def_link = def_item.xpath('./span/a/@href') - if 'doroszewski' in def_link[0]: - def_text = f"{def_text}" - src_defs.append((def_text, '')) - - definitions.append((src_text, src_defs)) - - if not definitions: - return results - - infobox = '' - for src in definitions: - infobox += f"
{src[0]}" - infobox += "
    " - for (def_text, sub_def) in src[1]: - infobox += f"
  • {def_text}
  • " - if sub_def: - infobox += "
      " - for sub_def_text in sub_def: - infobox += f"
    1. {sub_def_text}
    2. " - infobox += "
    " - infobox += "
" - - results.append( - { - 'infobox': word, - 'content': infobox, - } - ) - - return results diff --git a/apps/searxng/searx/engines/solidtorrents.py b/apps/searxng/searx/engines/solidtorrents.py deleted file mode 100755 index 9b5d543..0000000 --- a/apps/searxng/searx/engines/solidtorrents.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""SolidTorrents -""" - -from datetime import datetime -from urllib.parse import urlencode -import random - -from lxml import html - -from searx.utils import ( - extract_text, - eval_xpath, - eval_xpath_getindex, - eval_xpath_list, - get_torrent_size, -) - -about = { - "website": 'https://www.solidtorrents.net/', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = ['files'] -paging = True - -# base_url can be overwritten by a list of URLs in the settings.yml -base_url = 'https://solidtorrents.net' - - -def request(query, params): - if isinstance(base_url, list): - params['base_url'] = random.choice(base_url) - else: - params['base_url'] = base_url - search_url = params['base_url'] + '/search?{query}' - page = (params['pageno'] - 1) * 20 - query = urlencode({'q': query, 'page': page}) - params['url'] = search_url.format(query=query) - return params - - -def response(resp): - results = [] - dom = html.fromstring(resp.text) - - for result in eval_xpath(dom, '//div[contains(@class, "search-result")]'): - a = eval_xpath_getindex(result, './div/h5/a', 0, None) - if a is None: - continue - title = extract_text(a) - url = eval_xpath_getindex(a, '@href', 0, None) - categ = eval_xpath(result, './div//a[contains(@class, "category")]') - metadata = extract_text(categ) - stats = eval_xpath_list(result, './div//div[contains(@class, "stats")]/div', min_len=5) - n, u = extract_text(stats[1]).split() - filesize = get_torrent_size(n, u) - leech = extract_text(stats[2]) - seed = extract_text(stats[3]) - torrentfile = eval_xpath_getindex(result, './div//a[contains(@class, "dl-torrent")]/@href', 0, None) - magnet = eval_xpath_getindex(result, './div//a[contains(@class, "dl-magnet")]/@href', 0, None) - - params = { - 'seed': seed, - 'leech': leech, - 'title': title, - 'url': resp.search_params['base_url'] + url, - 'filesize': filesize, - 'magnetlink': magnet, - 'torrentfile': torrentfile, - 'metadata': metadata, - 'template': "torrent.html", - } - - date_str = extract_text(stats[4]) - - try: - params['publishedDate'] = datetime.strptime(date_str, '%b %d, %Y') - except ValueError: - pass - - results.append(params) - - return results diff --git a/apps/searxng/searx/engines/solr.py b/apps/searxng/searx/engines/solr.py deleted file mode 100755 index 85ed42c..0000000 --- a/apps/searxng/searx/engines/solr.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. sidebar:: info - - - :origin:`solr.py ` - - `Solr `_ - - `Solr Resources `_ - - `Install Solr `_ - -Solr_ is a popular search engine based on Lucene, just like Elasticsearch_. But -instead of searching in indices, you can search in collections. - -Example -======= - -This is an example configuration for searching in the collection -``my-collection`` and get the results in ascending order. - -.. 
code:: yaml - - - name: solr - engine: solr - shortcut: slr - base_url: http://localhost:8983 - collection: my-collection - sort: asc - enable_http: true - -""" - -# pylint: disable=global-statement - -from json import loads -from urllib.parse import urlencode -from searx.exceptions import SearxEngineAPIException - - -base_url = 'http://localhost:8983' -collection = '' -rows = 10 -sort = '' # sorting: asc or desc -field_list = 'name' # list of field names to display on the UI -default_fields = '' # default field to query -query_fields = '' # query fields -_search_url = '' -paging = True - - -def init(_): - if collection == '': - raise ValueError('collection cannot be empty') - - global _search_url - _search_url = base_url + '/solr/' + collection + '/select?{params}' - - -def request(query, params): - query_params = {'q': query, 'rows': rows} - if field_list != '': - query_params['fl'] = field_list - if query_fields != '': - query_params['qf'] = query_fields - if default_fields != '': - query_params['df'] = default_fields - if sort != '': - query_params['sort'] = sort - - if 'pageno' in params: - query_params['start'] = rows * (params['pageno'] - 1) - - params['url'] = _search_url.format(params=urlencode(query_params)) - - return params - - -def response(resp): - resp_json = __get_response(resp) - - results = [] - for result in resp_json['response']['docs']: - r = {key: str(value) for key, value in result.items()} - if len(r) == 0: - continue - r['template'] = 'key-value.html' - results.append(r) - - return results - - -def __get_response(resp): - try: - resp_json = loads(resp.text) - except Exception as e: - raise SearxEngineAPIException("failed to parse response") from e - - if 'error' in resp_json: - raise SearxEngineAPIException(resp_json['error']['msg']) - - return resp_json diff --git a/apps/searxng/searx/engines/soundcloud.py b/apps/searxng/searx/engines/soundcloud.py deleted file mode 100755 index 78947c6..0000000 --- a/apps/searxng/searx/engines/soundcloud.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Soundcloud (Music) -""" - -import re -from json import loads -from lxml import html -from dateutil import parser -from urllib.parse import quote_plus, urlencode -from searx.network import get as http_get - -# about -about = { - "website": 'https://soundcloud.com', - "wikidata_id": 'Q568769', - "official_api_documentation": 'https://developers.soundcloud.com/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['music'] -paging = True - -# search-url -# missing attribute: user_id, app_version, app_locale -url = 'https://api-v2.soundcloud.com/' -search_url = ( - url + 'search?{query}' - '&variant_ids=' - '&facet=model' - '&limit=20' - '&offset={offset}' - '&linked_partitioning=1' - '&client_id={client_id}' -) # noqa - -cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U) -guest_client_id = '' - - -def get_client_id(): - response = http_get("https://soundcloud.com") - - if response.ok: - tree = html.fromstring(response.content) - # script_tags has been moved from /assets/app/ to /assets/ path. 
I - # found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js - script_tags = tree.xpath("//script[contains(@src, '/assets/')]") - app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None] - - # extracts valid app_js urls from soundcloud.com content - for app_js_url in app_js_urls[::-1]: - # gets app_js and searches for the clientid - response = http_get(app_js_url) - if response.ok: - cids = cid_re.search(response.content.decode()) - if cids is not None and len(cids.groups()): - return cids.groups()[0] - logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!") - return "" - - -def init(engine_settings=None): - global guest_client_id - # api-key - guest_client_id = get_client_id() - - -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * 20 - - params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset, client_id=guest_client_id) - - return params - - -# get response from search-request -def response(resp): - results = [] - search_res = loads(resp.text) - - # parse results - for result in search_res.get('collection', []): - - if result['kind'] in ('track', 'playlist'): - uri = quote_plus(result['uri']) - res = { - 'url': result['permalink_url'], - 'title': result['title'], - 'content': result['description'] or '', - 'publishedDate': parser.parse(result['last_modified']), - 'iframe_src': "https://w.soundcloud.com/player/?url=" + uri, - } - img_src = result['artwork_url'] or result['user']['avatar_url'] - if img_src: - res['img_src'] = img_src - results.append(res) - - return results diff --git a/apps/searxng/searx/engines/spotify.py b/apps/searxng/searx/engines/spotify.py deleted file mode 100755 index 87edb7f..0000000 --- a/apps/searxng/searx/engines/spotify.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Spotify (Music) -""" - -from json import loads -from urllib.parse import urlencode -import base64 - -from searx.network import post as http_post - -# about -about = { - "website": 'https://www.spotify.com', - "wikidata_id": 'Q689141', - "official_api_documentation": 'https://developer.spotify.com/web-api/search-item/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['music'] -paging = True -api_client_id = None -api_client_secret = None - -# search-url -url = 'https://api.spotify.com/' -search_url = url + 'v1/search?{query}&type=track&offset={offset}' - -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * 20 - - params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) - - r = http_post( - 'https://accounts.spotify.com/api/token', - data={'grant_type': 'client_credentials'}, - headers={ - 'Authorization': 'Basic ' - + base64.b64encode("{}:{}".format(api_client_id, api_client_secret).encode()).decode() - }, - ) - j = loads(r.text) - params['headers'] = {'Authorization': 'Bearer {}'.format(j.get('access_token'))} - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_res = loads(resp.text) - - # parse results - for result in search_res.get('tracks', {}).get('items', {}): - if result['type'] == 'track': - title = result['name'] - url = result['external_urls']['spotify'] - content = '{} - {} - {}'.format(result['artists'][0]['name'], result['album']['name'], result['name']) - - # append result - results.append( - { - 'url': url, - 'title': title, - 
'iframe_src': "https://embed.spotify.com/?uri=spotify:track:" + result['id'], - 'content': content, - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/springer.py b/apps/searxng/searx/engines/springer.py deleted file mode 100755 index a4d0832..0000000 --- a/apps/searxng/searx/engines/springer.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Springer Nature (science) - -""" - -from datetime import datetime -from json import loads -from urllib.parse import urlencode - -from searx.exceptions import SearxEngineAPIException - -about = { - "website": 'https://www.springernature.com/', - "wikidata_id": 'Q21096327', - "official_api_documentation": 'https://dev.springernature.com/', - "use_official_api": True, - "require_api_key": True, - "results": 'JSON', -} - -categories = ['science', 'scientific publications'] -paging = True -nb_per_page = 10 -api_key = 'unset' - -base_url = 'https://api.springernature.com/metadata/json?' - - -def request(query, params): - if api_key == 'unset': - raise SearxEngineAPIException('missing Springer-Nature API key') - args = urlencode({'q': query, 's': nb_per_page * (params['pageno'] - 1), 'p': nb_per_page, 'api_key': api_key}) - params['url'] = base_url + args - logger.debug("query_url --> %s", params['url']) - return params - - -def response(resp): - results = [] - json_data = loads(resp.text) - - for record in json_data['records']: - published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') - authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']] - tags = record.get('genre') - if isinstance(tags, str): - tags = [tags] - results.append( - { - 'template': 'paper.html', - 'url': record['url'][0]['value'].replace('http://', 'https://', 1), - 'title': record['title'], - 'content': record['abstract'], - 'comments': record['publicationName'], - 'tags': tags, - 'publishedDate': published, - 'type': record.get('contentType'), - 'authors': authors, - # 'editor': '', - 'publisher': record.get('publisher'), - 'journal': record.get('publicationName'), - 'volume': record.get('volume') or None, - 'pages': '-'.join([x for x in [record.get('startingPage'), record.get('endingPage')] if x]), - 'number': record.get('number') or None, - 'doi': record.get('doi'), - 'issn': [x for x in [record.get('issn')] if x], - 'isbn': [x for x in [record.get('isbn')] if x], - # 'pdf_url' : '' - } - ) - return results diff --git a/apps/searxng/searx/engines/sqlite.py b/apps/searxng/searx/engines/sqlite.py deleted file mode 100755 index c86df58..0000000 --- a/apps/searxng/searx/engines/sqlite.py +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""SQLite is a small, fast and reliable SQL database engine. It does not require -any extra dependency. - -Example -======= - -.. _MediathekView: https://mediathekview.de/ - -To demonstrate the power of database engines, here is a more complex example -which reads from a MediathekView_ (DE) movie database. For this example of the -SQlite engine download the database: - -- https://liste.mediathekview.de/filmliste-v2.db.bz2 - -and unpack into ``searx/data/filmliste-v2.db``. To search the database use e.g -Query to test: ``!mediathekview concert`` - -.. 
code:: yaml - - - name: mediathekview - engine: sqlite - disabled: False - categories: general - result_template: default.html - database: searx/data/filmliste-v2.db - query_str: >- - SELECT title || ' (' || time(duration, 'unixepoch') || ')' AS title, - COALESCE( NULLIF(url_video_hd,''), NULLIF(url_video_sd,''), url_video) AS url, - description AS content - FROM film - WHERE title LIKE :wildcard OR description LIKE :wildcard - ORDER BY duration DESC - -Implementations -=============== - -""" - -import sqlite3 -import contextlib - -engine_type = 'offline' -database = "" -query_str = "" -limit = 10 -paging = True -result_template = 'key-value.html' - - -def init(engine_settings): - if 'query_str' not in engine_settings: - raise ValueError('query_str cannot be empty') - - if not engine_settings['query_str'].lower().startswith('select '): - raise ValueError('only SELECT query is supported') - - -@contextlib.contextmanager -def sqlite_cursor(): - """Implements a :py:obj:`Context Manager ` for a - :py:obj:`sqlite3.Cursor`. - - Open database in read only mode: if the database doesn't exist. The default - mode creates an empty file on the file system. See: - - * https://docs.python.org/3/library/sqlite3.html#sqlite3.connect - * https://www.sqlite.org/uri.html - - """ - uri = 'file:' + database + '?mode=ro' - with contextlib.closing(sqlite3.connect(uri, uri=True)) as connect: - connect.row_factory = sqlite3.Row - with contextlib.closing(connect.cursor()) as cursor: - yield cursor - - -def search(query, params): - results = [] - - query_params = { - 'query': query, - 'wildcard': r'%' + query.replace(' ', r'%') + r'%', - 'limit': limit, - 'offset': (params['pageno'] - 1) * limit, - } - query_to_run = query_str + ' LIMIT :limit OFFSET :offset' - - with sqlite_cursor() as cur: - - cur.execute(query_to_run, query_params) - col_names = [cn[0] for cn in cur.description] - - for row in cur.fetchall(): - item = dict(zip(col_names, map(str, row))) - item['template'] = result_template - logger.debug("append result --> %s", item) - results.append(item) - - return results diff --git a/apps/searxng/searx/engines/stackexchange.py b/apps/searxng/searx/engines/stackexchange.py deleted file mode 100755 index 99615b1..0000000 --- a/apps/searxng/searx/engines/stackexchange.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Stack Exchange API v2.3 - -* https://api.stackexchange.com/ - -""" - -import html -from json import loads -from urllib.parse import urlencode - -about = { - "website": 'https://stackexchange.com', - "wikidata_id": 'Q3495447', - "official_api_documentation": 'https://api.stackexchange.com/docs', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -paging = True -pagesize = 10 - -api_site = 'stackoverflow' -api_sort = 'activity' -api_order = 'desc' - -# https://api.stackexchange.com/docs/advanced-search -search_api = 'https://api.stackexchange.com/2.3/search/advanced?' 
- - -def request(query, params): - - args = urlencode( - { - 'q': query, - 'page': params['pageno'], - 'pagesize': pagesize, - 'site': api_site, - 'sort': api_sort, - 'order': 'desc', - } - ) - params['url'] = search_api + args - - return params - - -def response(resp): - - results = [] - json_data = loads(resp.text) - - for result in json_data['items']: - - content = "[%s]" % ", ".join(result['tags']) - content += " %s" % result['owner']['display_name'] - if result['is_answered']: - content += ' // is answered' - content += " // score: %s" % result['score'] - - results.append( - { - 'url': "https://%s.com/q/%s" % (api_site, result['question_id']), - 'title': html.unescape(result['title']), - 'content': html.unescape(content), - } - ) - - return results diff --git a/apps/searxng/searx/engines/startpage.py b/apps/searxng/searx/engines/startpage.py deleted file mode 100755 index 92d6986..0000000 --- a/apps/searxng/searx/engines/startpage.py +++ /dev/null @@ -1,494 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Startpage's language & region selectors are a mess .. - -.. _startpage regions: - -Startpage regions -================= - -In the list of regions there are tags we need to map to common region tags:: - - pt-BR_BR --> pt_BR - zh-CN_CN --> zh_Hans_CN - zh-TW_TW --> zh_Hant_TW - zh-TW_HK --> zh_Hant_HK - en-GB_GB --> en_GB - -and there is at least one tag with a three letter language tag (ISO 639-2):: - - fil_PH --> fil_PH - -The locale code ``no_NO`` from Startpage does not exists and is mapped to -``nb-NO``:: - - babel.core.UnknownLocaleError: unknown locale 'no_NO' - -For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and -W3C recommends subtag over macrolanguage [2]_. - -.. [1] `iana: language-subtag-registry - `_ :: - - type: language - Subtag: nb - Description: Norwegian Bokmål - Added: 2005-10-16 - Suppress-Script: Latn - Macrolanguage: no - -.. [2] - Use macrolanguages with care. Some language subtags have a Scope field set to - macrolanguage, i.e. this primary language subtag encompasses a number of more - specific primary language subtags in the registry. ... As we recommended for - the collection subtags mentioned above, in most cases you should try to use - the more specific subtags ... `W3: The primary language subtag - `_ - -.. _startpage languages: - -Startpage languages -=================== - -:py:obj:`send_accept_language_header`: - The displayed name in Startpage's settings page depend on the location of the - IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits` - we use:: - - 'Accept-Language': "en-US,en;q=0.5", - .. - - to get uniform names independent from the IP). - -.. _startpage categories: - -Startpage categories -==================== - -Startpage's category (for Web-search, News, Videos, ..) is set by -:py:obj:`startpage_categ` in settings.yml:: - - - name: startpage - engine: startpage - startpage_categ: web - ... - -.. hint:: - - The default category is ``web`` .. and other categories than ``web`` are not - yet implemented. 
- -""" - -from typing import TYPE_CHECKING -from collections import OrderedDict -import re -from unicodedata import normalize, combining -from time import time -from datetime import datetime, timedelta - -import dateutil.parser -import lxml.html -import babel - -from searx.utils import extract_text, eval_xpath, gen_useragent -from searx.network import get # see https://github.com/searxng/searxng/issues/762 -from searx.exceptions import SearxEngineCaptchaException -from searx.locales import region_tag -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://startpage.com', - "wikidata_id": 'Q2333295', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -startpage_categ = 'web' -"""Startpage's category, visit :ref:`startpage categories`. -""" - -send_accept_language_header = True -"""Startpage tries to guess user's language and territory from the HTTP -``Accept-Language``. Optional the user can select a search-language (can be -different to the UI language) and a region filter. -""" - -# engine dependent config -categories = ['general', 'web'] -paging = True -time_range_support = True -safesearch = True - -time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} -safesearch_dict = {0: '0', 1: '1', 2: '1'} - -# search-url -base_url = 'https://www.startpage.com' -search_url = base_url + '/sp/search' - -# specific xpath variables -# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] -# not ads: div[@class="result"] are the direct childs of div[@id="results"] -results_xpath = '//div[@class="w-gl__result__main"]' -link_xpath = './/a[@class="w-gl__result-title result-link"]' -content_xpath = './/p[@class="w-gl__description"]' -search_form_xpath = '//form[@id="search"]' -"""XPath of Startpage's origin search form - -.. code: html - -
<form action="/sp/search" method="post"> - <input type="text" name="query" value=""> - <input type="hidden" name="t" value="device"> - <input type="hidden" name="lui" value="english"> - <input type="hidden" name="sc" value="..."> - <input type="hidden" name="cat" value="web"> - <input type="hidden" name="abp" value="1"> - </form>
-""" - -# timestamp of the last fetch of 'sc' code -sc_code_ts = 0 -sc_code = '' -sc_code_cache_sec = 30 -"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" - - -def get_sc_code(searxng_locale, params): - """Get an actual ``sc`` argument from Startpage's search form (HTML page). - - Startpage puts a ``sc`` argument on every HTML :py:obj:`search form - `. Without this argument Startpage considers the request - is from a bot. We do not know what is encoded in the value of the ``sc`` - argument, but it seems to be a kind of a *time-stamp*. - - Startpage's search form generates a new sc-code on each request. This - function scrap a new sc-code from Startpage's home page every - :py:obj:`sc_code_cache_sec` seconds. - - """ - - global sc_code_ts, sc_code # pylint: disable=global-statement - - if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)): - logger.debug("get_sc_code: reuse '%s'", sc_code) - return sc_code - - headers = {**params['headers']} - headers['Origin'] = base_url - headers['Referer'] = base_url + '/' - # headers['Connection'] = 'keep-alive' - # headers['Accept-Encoding'] = 'gzip, deflate, br' - # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' - # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0' - - # add Accept-Language header - if searxng_locale == 'all': - searxng_locale = 'en-US' - locale = babel.Locale.parse(searxng_locale, sep='-') - - if send_accept_language_header: - ac_lang = locale.language - if locale.territory: - ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( - locale.language, - locale.territory, - locale.language, - ) - headers['Accept-Language'] = ac_lang - - get_sc_url = base_url + '/?sc=%s' % (sc_code) - logger.debug("query new sc time-stamp ... %s", get_sc_url) - logger.debug("headers: %s", headers) - resp = get(get_sc_url, headers=headers) - - # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers) - # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg - # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 - - if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore - raise SearxEngineCaptchaException( - message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", - ) - - dom = lxml.html.fromstring(resp.text) # type: ignore - - try: - sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] - except IndexError as exc: - logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") - raise SearxEngineCaptchaException( - message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, # type: ignore - ) from exc - - sc_code_ts = time() - logger.debug("get_sc_code: new value is: %s", sc_code) - return sc_code - - -def request(query, params): - """Assemble a Startpage request. - - To avoid CAPTCHA we need to send a well formed HTTP POST request with a - cookie. We need to form a request that is identical to the request build by - Startpage's search form: - - - in the cookie the **region** is selected - - in the HTTP POST data the **language** is selected - - Additionally the arguments form Startpage's search form needs to be set in - HTML POST data / compare ```` elements: :py:obj:`search_form_xpath`. 
- """ - if startpage_categ == 'web': - return _request_cat_web(query, params) - - logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) - return params - - -def _request_cat_web(query, params): - - engine_region = traits.get_region(params['searxng_locale'], 'en-US') - engine_language = traits.get_language(params['searxng_locale'], 'en') - - # build arguments - args = { - 'query': query, - 'cat': 'web', - 't': 'device', - 'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers, - 'with_date': time_range_dict.get(params['time_range'], ''), - } - - if engine_language: - args['language'] = engine_language - args['lui'] = engine_language - - args['abp'] = '1' - if params['pageno'] > 1: - args['page'] = params['pageno'] - - # build cookie - lang_homepage = 'en' - cookie = OrderedDict() - cookie['date_time'] = 'world' - cookie['disable_family_filter'] = safesearch_dict[params['safesearch']] - cookie['disable_open_in_new_window'] = '0' - cookie['enable_post_method'] = '1' # hint: POST - cookie['enable_proxy_safety_suggest'] = '1' - cookie['enable_stay_control'] = '1' - cookie['instant_answers'] = '1' - cookie['lang_homepage'] = 's/device/%s/' % lang_homepage - cookie['num_of_results'] = '10' - cookie['suggestions'] = '1' - cookie['wt_unit'] = 'celsius' - - if engine_language: - cookie['language'] = engine_language - cookie['language_ui'] = engine_language - - if engine_region: - cookie['search_results_region'] = engine_region - - params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()]) - logger.debug('cookie preferences: %s', params['cookies']['preferences']) - - # POST request - logger.debug("data: %s", args) - params['data'] = args - params['method'] = 'POST' - params['url'] = search_url - params['headers']['Origin'] = base_url - params['headers']['Referer'] = base_url + '/' - # is the Accept header needed? - # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - - return params - - -# get response from search-request -def response(resp): - dom = lxml.html.fromstring(resp.text) - - if startpage_categ == 'web': - return _response_cat_web(dom) - - logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) - return [] - - -def _response_cat_web(dom): - results = [] - - # parse results - for result in eval_xpath(dom, results_xpath): - links = eval_xpath(result, link_xpath) - if not links: - continue - link = links[0] - url = link.attrib.get('href') - - # block google-ad url's - if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): - continue - - # block startpage search url's - if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): - continue - - title = extract_text(link) - - if eval_xpath(result, content_xpath): - content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore - else: - content = '' - - published_date = None - - # check if search result starts with something like: "2 Sep 2014 ... " - if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): - date_pos = content.find('...') + 4 - date_string = content[0 : date_pos - 5] - # fix content string - content = content[date_pos:] - - try: - published_date = dateutil.parser.parse(date_string, dayfirst=True) - except ValueError: - pass - - # check if search result starts with something like: "5 days ago ... " - elif re.match(r"^[0-9]+ days? ago \.\.\. 
", content): - date_pos = content.find('...') + 4 - date_string = content[0 : date_pos - 5] - - # calculate datetime - published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore - - # fix content string - content = content[date_pos:] - - if published_date: - # append result - results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date}) - else: - # append result - results.append({'url': url, 'title': title, 'content': content}) - - # return results - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch :ref:`languages ` and :ref:`regions ` from Startpage.""" - # pylint: disable=too-many-branches - - headers = { - 'User-Agent': gen_useragent(), - 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language - } - resp = get('https://www.startpage.com/do/settings', headers=headers) - - if not resp.ok: # type: ignore - print("ERROR: response from Startpage is not OK.") - - dom = lxml.html.fromstring(resp.text) # type: ignore - - # regions - - sp_region_names = [] - for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'): - sp_region_names.append(option.get('value')) - - for eng_tag in sp_region_names: - if eng_tag == 'all': - continue - babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway - - if '-' in babel_region_tag: - l, r = babel_region_tag.split('-') - r = r.split('_')[-1] - sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_')) - - else: - try: - sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_')) - - except babel.UnknownLocaleError: - print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag) - continue - - conflict = engine_traits.regions.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.regions[sxng_tag] = eng_tag - - # languages - - catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()} - - # get the native name of every language known by babel - - for lang_code in filter( - lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers() # type: ignore - ): - native_name = babel.Locale(lang_code).get_language_name().lower() # type: ignore - # add native name exactly as it is - catalog_engine2code[native_name] = lang_code - - # add "normalized" language name (i.e. français becomes francais and español becomes espanol) - unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name))) - if len(unaccented_name) == len(unaccented_name.encode()): - # add only if result is ascii (otherwise "normalization" didn't work) - catalog_engine2code[unaccented_name] = lang_code - - # values that can't be determined by babel's languages names - - catalog_engine2code.update( - { - # traditional chinese used in .. - 'fantizhengwen': 'zh_Hant', - # Korean alphabet - 'hangul': 'ko', - # Malayalam is one of 22 scheduled languages of India. 
- 'malayam': 'ml', - 'norsk': 'nb', - 'sinhalese': 'si', - } - ) - - skip_eng_tags = { - 'english_uk', # SearXNG lang 'en' already maps to 'english' - } - - for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'): - - eng_tag = option.get('value') - if eng_tag in skip_eng_tags: - continue - name = extract_text(option).lower() # type: ignore - - sxng_tag = catalog_engine2code.get(eng_tag) - if sxng_tag is None: - sxng_tag = catalog_engine2code[name] - - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.languages[sxng_tag] = eng_tag diff --git a/apps/searxng/searx/engines/tagesschau.py b/apps/searxng/searx/engines/tagesschau.py deleted file mode 100755 index 4a36747..0000000 --- a/apps/searxng/searx/engines/tagesschau.py +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""ARD: `Tagesschau API`_ - -The Tagesschau is a news program of the ARD. Via the `Tagesschau API`_, current -news and media reports are available in JSON format. The `Bundesstelle für Open -Data`_ offers a `OpenAPI`_ portal at bundDEV_ where APIs are documented an can -be tested. - -This SearXNG engine uses the `/api2u/search`_ API. - -.. _/api2u/search: http://tagesschau.api.bund.dev/ -.. _bundDEV: https://bund.dev/apis -.. _Bundesstelle für Open Data: https://github.com/bundesAPI -.. _Tagesschau API: https://github.com/AndreasFischer1985/tagesschau-api/blob/main/README_en.md -.. _OpenAPI: https://swagger.io/specification/ - -""" -from typing import TYPE_CHECKING - -from datetime import datetime -from urllib.parse import urlencode -import re - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -about = { - 'website': "https://tagesschau.de", - 'wikidata_id': "Q703907", - 'official_api_documentation': None, - 'use_official_api': True, - 'require_api_key': False, - 'results': 'JSON', - 'language': 'de', -} -categories = ['general', 'news'] -paging = True - -results_per_page = 10 -base_url = "https://www.tagesschau.de" - - -def request(query, params): - args = { - 'searchText': query, - 'pageSize': results_per_page, - 'resultPage': params['pageno'] - 1, - } - - params['url'] = f"{base_url}/api2u/search?{urlencode(args)}" - - return params - - -def response(resp): - results = [] - - json = resp.json() - - for item in json['searchResults']: - item_type = item.get('type') - if item_type in ('story', 'webview'): - results.append(_story(item)) - elif item_type == 'video': - results.append(_video(item)) - else: - logger.error("unknow result type: %s", item_type) - - return results - - -def _story(item): - return { - 'title': item['title'], - 'thumbnail': item.get('teaserImage', {}).get('imageVariants', {}).get('16x9-256'), - 'publishedDate': datetime.strptime(item['date'][:19], '%Y-%m-%dT%H:%M:%S'), - 'content': item['firstSentence'], - 'url': item['shareURL'], - } - - -def _video(item): - video_url = item['streams']['h264s'] - title = item['title'] - - if "_vapp.mxf" in title: - title = title.replace("_vapp.mxf", "") - title = re.sub(r"APP\d+ (FC-)?", "", title, count=1) - - return { - 'template': 'videos.html', - 'title': title, - 'thumbnail': item.get('teaserImage', {}).get('imageVariants', {}).get('16x9-256'), - 'publishedDate': datetime.strptime(item['date'][:19], '%Y-%m-%dT%H:%M:%S'), - 'content': item.get('firstSentence', ''), - 'iframe_src': video_url, - 'url': video_url, - } diff --git 
a/apps/searxng/searx/engines/tineye.py b/apps/searxng/searx/engines/tineye.py deleted file mode 100755 index 6c5ff13..0000000 --- a/apps/searxng/searx/engines/tineye.py +++ /dev/null @@ -1,225 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This engine implements *Tineye - reverse image search* - -Using TinEye, you can search by image or perform what we call a reverse image -search. You can do that by uploading an image or searching by URL. You can also -simply drag and drop your images to start your search. TinEye constantly crawls -the web and adds images to its index. Today, the TinEye index is over 50.2 -billion images `[tineye.com] `_. - -.. hint:: - - This SearXNG engine only supports *'searching by URL'* and it does not use - the official API `[api.tineye.com] `_. - -""" - -from urllib.parse import urlencode -from datetime import datetime -from flask_babel import gettext - -about = { - "website": 'https://tineye.com', - "wikidata_id": 'Q2382535', - "official_api_documentation": 'https://api.tineye.com/python/docs/', - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -engine_type = 'online_url_search' -""":py:obj:`searx.search.processors.online_url_search`""" - -categories = ['general'] -paging = True -safesearch = False -base_url = 'https://tineye.com' -search_string = '/result_json/?page={page}&{query}' - -FORMAT_NOT_SUPPORTED = gettext( - "Could not read that image url. This may be due to an unsupported file" - " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP." -) -"""TinEye error message""" - -NO_SIGNATURE_ERROR = gettext( - "The image is too simple to find matches. TinEye requires a basic level of" - " visual detail to successfully identify matches." -) -"""TinEye error message""" - -DOWNLOAD_ERROR = gettext("The image could not be downloaded.") -"""TinEye error message""" - - -def request(query, params): - """Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`.""" - - params['raise_for_httperror'] = False - - if params['search_urls']['data:image']: - query = params['search_urls']['data:image'] - elif params['search_urls']['http']: - query = params['search_urls']['http'] - - logger.debug("query URL: %s", query) - query = urlencode({'url': query}) - - # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py - params['url'] = base_url + search_string.format(query=query, page=params['pageno']) - - params['headers'].update( - { - 'Connection': 'keep-alive', - 'Accept-Encoding': 'gzip, defalte, br', - 'Host': 'tineye.com', - 'DNT': '1', - 'TE': 'trailers', - } - ) - return params - - -def parse_tineye_match(match_json): - """Takes parsed JSON from the API server and turns it into a :py:obj:`dict` - object. - - Attributes `(class Match) `__ - - - `image_url`, link to the result image. - - `domain`, domain this result was found on. - - `score`, a number (0 to 100) that indicates how closely the images match. - - `width`, image width in pixels. - - `height`, image height in pixels. - - `size`, image area in pixels. - - `format`, image format. - - `filesize`, image size in bytes. - - `overlay`, overlay URL. - - `tags`, whether this match belongs to a collection or stock domain. - - - `backlinks`, a list of Backlink objects pointing to the original websites - and image URLs. List items are instances of :py:obj:`dict`, (`Backlink - `__): - - - `url`, the image URL to the image. - - `backlink`, the original website URL. - - `crawl_date`, the date the image was crawled. 
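# Illustrative sketch: how the deleted tineye.py request() above composes the
# result_json URL for a reverse image search. The image URL is a made-up
# example value; the engine only supports "search by URL".
from urllib.parse import urlencode

base_url = 'https://tineye.com'
search_string = '/result_json/?page={page}&{query}'

image_url = 'https://example.org/some-image.png'   # hypothetical input
query = urlencode({'url': image_url})
print(base_url + search_string.format(query=query, page=1))
# -> https://tineye.com/result_json/?page=1&url=https%3A%2F%2Fexample.org%2Fsome-image.png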
- - """ - - # HINT: there exists an alternative backlink dict in the domains list / e.g.:: - # - # match_json['domains'][0]['backlinks'] - - backlinks = [] - if "backlinks" in match_json: - - for backlink_json in match_json["backlinks"]: - if not isinstance(backlink_json, dict): - continue - - crawl_date = backlink_json.get("crawl_date") - if crawl_date: - crawl_date = datetime.fromisoformat(crawl_date[:-3]) - else: - crawl_date = datetime.min - - backlinks.append( - { - 'url': backlink_json.get("url"), - 'backlink': backlink_json.get("backlink"), - 'crawl_date': crawl_date, - 'image_name': backlink_json.get("image_name"), - } - ) - - return { - 'image_url': match_json.get("image_url"), - 'domain': match_json.get("domain"), - 'score': match_json.get("score"), - 'width': match_json.get("width"), - 'height': match_json.get("height"), - 'size': match_json.get("size"), - 'image_format': match_json.get("format"), - 'filesize': match_json.get("filesize"), - 'overlay': match_json.get("overlay"), - 'tags': match_json.get("tags"), - 'backlinks': backlinks, - } - - -def response(resp): - """Parse HTTP response from TinEye.""" - results = [] - - try: - json_data = resp.json() - except Exception as exc: # pylint: disable=broad-except - msg = "can't parse JSON response // %s" % exc - logger.error(msg) - json_data = {'error': msg} - - # handle error codes from Tineye - - if resp.is_error: - if resp.status_code in (400, 422): - - message = 'HTTP status: %s' % resp.status_code - error = json_data.get('error') - s_key = json_data.get('suggestions', {}).get('key', '') - - if error and s_key: - message = "%s (%s)" % (error, s_key) - elif error: - message = error - - if s_key == "Invalid image URL": - # test https://docs.searxng.org/_static/searxng-wordmark.svg - message = FORMAT_NOT_SUPPORTED - elif s_key == 'NO_SIGNATURE_ERROR': - # test https://pngimg.com/uploads/dot/dot_PNG4.png - message = NO_SIGNATURE_ERROR - elif s_key == 'Download Error': - # test https://notexists - message = DOWNLOAD_ERROR - - # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 - # results.append({'answer': message}) - logger.error(message) - - return results - - resp.raise_for_status() - - # append results from matches - - for match_json in json_data['matches']: - - tineye_match = parse_tineye_match(match_json) - if not tineye_match['backlinks']: - continue - - backlink = tineye_match['backlinks'][0] - results.append( - { - 'template': 'images.html', - 'url': backlink['backlink'], - 'thumbnail_src': tineye_match['image_url'], - 'source': backlink['url'], - 'title': backlink['image_name'], - 'img_src': backlink['url'], - 'format': tineye_match['image_format'], - 'widht': tineye_match['width'], - 'height': tineye_match['height'], - 'publishedDate': backlink['crawl_date'], - } - ) - - # append number of results - - number_of_results = json_data.get('num_matches') - if number_of_results: - results.append({'number_of_results': number_of_results}) - - return results diff --git a/apps/searxng/searx/engines/tokyotoshokan.py b/apps/searxng/searx/engines/tokyotoshokan.py deleted file mode 100755 index b01de38..0000000 --- a/apps/searxng/searx/engines/tokyotoshokan.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Tokyo Toshokan (A BitTorrent Library for Japanese Media) -""" - -import re -from urllib.parse import urlencode -from lxml import html -from datetime import datetime -from searx.utils import extract_text, get_torrent_size, int_or_zero - -# about -about = { - "website": 
'https://www.tokyotosho.info/', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files'] -paging = True - -# search-url -base_url = 'https://www.tokyotosho.info/' -search_url = base_url + 'search.php?{query}' - - -# do search-request -def request(query, params): - query = urlencode({'page': params['pageno'], 'terms': query}) - params['url'] = search_url.format(query=query) - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]') - - # check if there are no results or page layout was changed so we cannot parse it - # currently there are two rows for each result, so total count must be even - if len(rows) == 0 or len(rows) % 2 != 0: - return [] - - # regular expression for parsing torrent size strings - size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE) - - # processing the results, two rows at a time - for i in range(0, len(rows), 2): - # parse the first row - name_row = rows[i] - - links = name_row.xpath('./td[@class="desc-top"]/a') - params = {'template': 'torrent.html', 'url': links[-1].attrib.get('href'), 'title': extract_text(links[-1])} - # I have not yet seen any torrents without magnet links, but - # it's better to be prepared to stumble upon one some day - if len(links) == 2: - magnet = links[0].attrib.get('href') - if magnet.startswith('magnet'): - # okay, we have a valid magnet link, let's add it to the result - params['magnetlink'] = magnet - - # no more info in the first row, start parsing the second one - info_row = rows[i + 1] - desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0]) - for item in desc.split('|'): - item = item.strip() - if item.startswith('Size:'): - try: - # ('1.228', 'GB') - groups = size_re.match(item).groups() - params['filesize'] = get_torrent_size(groups[0], groups[1]) - except: - pass - elif item.startswith('Date:'): - try: - # Date: 2016-02-21 21:44 UTC - date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC') - params['publishedDate'] = date - except: - pass - elif item.startswith('Comment:'): - params['content'] = item - stats = info_row.xpath('./td[@class="stats"]/span') - # has the layout not changed yet? - if len(stats) == 3: - params['seed'] = int_or_zero(extract_text(stats[0])) - params['leech'] = int_or_zero(extract_text(stats[1])) - - results.append(params) - - return results diff --git a/apps/searxng/searx/engines/torznab.py b/apps/searxng/searx/engines/torznab.py deleted file mode 100755 index 0692d4a..0000000 --- a/apps/searxng/searx/engines/torznab.py +++ /dev/null @@ -1,243 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Torznab_ is an API specification that provides a standardized way to query -torrent site for content. It is used by a number of torrent applications, -including Prowlarr_ and Jackett_. - -Using this engine together with Prowlarr_ or Jackett_ allows you to search -a huge number of torrent sites which are not directly supported. - -Configuration -============= - -The engine has the following settings: - -``base_url``: - Torznab endpoint URL. - -``api_key``: - The API key to use for authentication. - -``torznab_categories``: - The categories to use for searching. This is a list of category IDs. See - Prowlarr-categories_ or Jackett-categories_ for more information. 
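# Illustrative sketch: how these torznab settings end up in the URL that the
# deleted request() below assembles. The endpoint, API key and category IDs
# are hypothetical Jackett-style example values.
from urllib.parse import quote

base_url = 'http://localhost:9117/api/v2.0/indexers/all/results/torznab'
api_key = 'xxxxxxxxxxxxxxxx'
torznab_categories = [2000, 5080]

search_url = base_url + '?t=search&q={search_query}'
if len(api_key) > 0:
    search_url += '&apikey={api_key}'
if len(torznab_categories) > 0:
    search_url += '&cat={torznab_categories}'

print(search_url.format(
    search_query=quote('ubuntu 24.04'),
    api_key=api_key,
    torznab_categories=",".join(str(x) for x in torznab_categories),
))
# -> http://localhost:9117/api/v2.0/indexers/all/results/torznab?t=search&q=ubuntu%2024.04&apikey=xxxxxxxxxxxxxxxx&cat=2000,5080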
- -``show_torrent_files``: - Whether to show the torrent file in the search results. Be carful as using - this with Prowlarr_ or Jackett_ leaks the API key. This should be used only - if you are querying a Torznab endpoint without authentication or if the - instance is private. Be aware that private trackers may ban you if you share - the torrent file. Defaults to ``false``. - -``show_magnet_links``: - Whether to show the magnet link in the search results. Be aware that private - trackers may ban you if you share the magnet link. Defaults to ``true``. - -.. _Torznab: - https://torznab.github.io/spec-1.3-draft/index.html -.. _Prowlarr: - https://github.com/Prowlarr/Prowlarr -.. _Jackett: - https://github.com/Jackett/Jackett -.. _Prowlarr-categories: - https://wiki.servarr.com/en/prowlarr/cardigann-yml-definition#categories -.. _Jackett-categories: - https://github.com/Jackett/Jackett/wiki/Jackett-Categories - -Implementations -=============== - -""" -from __future__ import annotations -from typing import TYPE_CHECKING - -from typing import List, Dict, Any -from datetime import datetime -from urllib.parse import quote -from lxml import etree # type: ignore - -from searx.exceptions import SearxEngineAPIException - -if TYPE_CHECKING: - import httpx - import logging - - logger: logging.Logger - -# engine settings -about: Dict[str, Any] = { - "website": None, - "wikidata_id": None, - "official_api_documentation": "https://torznab.github.io/spec-1.3-draft", - "use_official_api": True, - "require_api_key": False, - "results": 'XML', -} -categories: List[str] = ['files'] -paging: bool = False -time_range_support: bool = False - -# defined in settings.yml -# example (Jackett): "http://localhost:9117/api/v2.0/indexers/all/results/torznab" -base_url: str = '' -api_key: str = '' -# https://newznab.readthedocs.io/en/latest/misc/api/#predefined-categories -torznab_categories: List[str] = [] -show_torrent_files: bool = False -show_magnet_links: bool = True - - -def init(engine_settings=None): # pylint: disable=unused-argument - """Initialize the engine.""" - if len(base_url) < 1: - raise ValueError('missing torznab base_url') - - -def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]: - """Build the request params.""" - search_url: str = base_url + '?t=search&q={search_query}' - - if len(api_key) > 0: - search_url += '&apikey={api_key}' - if len(torznab_categories) > 0: - search_url += '&cat={torznab_categories}' - - params['url'] = search_url.format( - search_query=quote(query), api_key=api_key, torznab_categories=",".join([str(x) for x in torznab_categories]) - ) - - return params - - -def response(resp: httpx.Response) -> List[Dict[str, Any]]: - """Parse the XML response and return a list of results.""" - results = [] - search_results = etree.XML(resp.content) - - # handle errors: https://newznab.readthedocs.io/en/latest/misc/api/#newznab-error-codes - if search_results.tag == "error": - raise SearxEngineAPIException(search_results.get("description")) - - channel: etree.Element = search_results[0] - - item: etree.Element - for item in channel.iterfind('item'): - result: Dict[str, Any] = build_result(item) - results.append(result) - - return results - - -def build_result(item: etree.Element) -> Dict[str, Any]: - """Build a result from a XML item.""" - - # extract attributes from XML - # see https://torznab.github.io/spec-1.3-draft/torznab/Specification-v1.3.html#predefined-attributes - enclosure: etree.Element | None = item.find('enclosure') - enclosure_url: str | None = None - if enclosure 
is not None: - enclosure_url = enclosure.get('url') - - size = get_attribute(item, 'size') - if not size and enclosure: - size = enclosure.get('length') - if size: - size = int(size) - - guid = get_attribute(item, 'guid') - comments = get_attribute(item, 'comments') - pubDate = get_attribute(item, 'pubDate') - seeders = get_torznab_attribute(item, 'seeders') - leechers = get_torznab_attribute(item, 'leechers') - peers = get_torznab_attribute(item, 'peers') - - # map attributes to searx result - result: Dict[str, Any] = { - 'template': 'torrent.html', - 'title': get_attribute(item, 'title'), - 'filesize': size, - 'files': get_attribute(item, 'files'), - 'seed': seeders, - 'leech': _map_leechers(leechers, seeders, peers), - 'url': _map_result_url(guid, comments), - 'publishedDate': _map_published_date(pubDate), - 'torrentfile': None, - 'magnetlink': None, - } - - link = get_attribute(item, 'link') - if show_torrent_files: - result['torrentfile'] = _map_torrent_file(link, enclosure_url) - if show_magnet_links: - magneturl = get_torznab_attribute(item, 'magneturl') - result['magnetlink'] = _map_magnet_link(magneturl, guid, enclosure_url, link) - return result - - -def _map_result_url(guid: str | None, comments: str | None) -> str | None: - if guid and guid.startswith('http'): - return guid - if comments and comments.startswith('http'): - return comments - return None - - -def _map_leechers(leechers: str | None, seeders: str | None, peers: str | None) -> str | None: - if leechers: - return leechers - if seeders and peers: - return str(int(peers) - int(seeders)) - return None - - -def _map_published_date(pubDate: str | None) -> datetime | None: - if pubDate is not None: - try: - return datetime.strptime(pubDate, '%a, %d %b %Y %H:%M:%S %z') - except (ValueError, TypeError) as e: - logger.debug("ignore exception (publishedDate): %s", e) - return None - - -def _map_torrent_file(link: str | None, enclosure_url: str | None) -> str | None: - if link and link.startswith('http'): - return link - if enclosure_url and enclosure_url.startswith('http'): - return enclosure_url - return None - - -def _map_magnet_link( - magneturl: str | None, - guid: str | None, - enclosure_url: str | None, - link: str | None, -) -> str | None: - if magneturl and magneturl.startswith('magnet'): - return magneturl - if guid and guid.startswith('magnet'): - return guid - if enclosure_url and enclosure_url.startswith('magnet'): - return enclosure_url - if link and link.startswith('magnet'): - return link - return None - - -def get_attribute(item: etree.Element, property_name: str) -> str | None: - """Get attribute from item.""" - property_element: etree.Element | None = item.find(property_name) - if property_element is not None: - return property_element.text - return None - - -def get_torznab_attribute(item: etree.Element, attribute_name: str) -> str | None: - """Get torznab special attribute from item.""" - element: etree.Element | None = item.find( - './/torznab:attr[@name="{attribute_name}"]'.format(attribute_name=attribute_name), - {'torznab': 'http://torznab.com/schemas/2015/feed'}, - ) - if element is not None: - return element.get("value") - return None diff --git a/apps/searxng/searx/engines/translated.py b/apps/searxng/searx/engines/translated.py deleted file mode 100755 index 9900c01..0000000 --- a/apps/searxng/searx/engines/translated.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - MyMemory Translated -""" - -# about -about = { - "website": 'https://mymemory.translated.net/', - 
"wikidata_id": None, - "official_api_documentation": 'https://mymemory.translated.net/doc/spec.php', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -engine_type = 'online_dictionary' -categories = ['dictionaries'] -url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' -web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' -weight = 100 -https_support = True - -api_key = '' - - -def request(query, params): - if api_key: - key_form = '&key=' + api_key - else: - key_form = '' - params['url'] = url.format( - from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query'], key=key_form - ) - return params - - -def response(resp): - results = [] - results.append( - { - 'url': web_url.format( - from_lang=resp.search_params['from_lang'][2], - to_lang=resp.search_params['to_lang'][2], - query=resp.search_params['query'], - ), - 'title': '[{0}-{1}] {2}'.format( - resp.search_params['from_lang'][1], resp.search_params['to_lang'][1], resp.search_params['query'] - ), - 'content': resp.json()['responseData']['translatedText'], - } - ) - return results diff --git a/apps/searxng/searx/engines/twitter.py b/apps/searxng/searx/engines/twitter.py deleted file mode 100755 index 3ebe34b..0000000 --- a/apps/searxng/searx/engines/twitter.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Twitter (microblogging platform)""" - -from json import loads -from urllib.parse import urlencode -from datetime import datetime - -about = { - "website": 'https://twitter.com', - "wikidata_id": None, - "official_api_documentation": 'https://developer.twitter.com/en/docs/twitter-api', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -categories = ['social media'] - -url = "https://api.twitter.com" -search_url = ( - "{url}/2/search/adaptive.json?{query}&tweet_mode=extended&query_source=typed_query&pc=1&spelling_corrections=1" -) - - -def request(query, params): - params['url'] = search_url.format(url=url, query=urlencode({'q': query})) - - params['headers'] = { - # This token is used in the Twitter web interface (twitter.com). Without this header, the API doesn't work. - # The value of the token has never changed (or maybe once a long time ago). 
- # https://github.com/zedeus/nitter/blob/5f31e86e0e8578377fa7d5aeb9631bbb2d35ef1e/src/consts.nim#L5 - 'Authorization': ( - "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKb" - "T3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw" - ) - } - - return params - - -def response(resp): - results = [] - - json_res = loads(resp.text)['globalObjects'] - - for tweet in json_res['tweets'].values(): - text = tweet['full_text'] - display = tweet['display_text_range'] - - img_src = tweet.get('extended_entities', {}).get('media', [{}])[0].get('media_url_https') - if img_src: - img_src += "?name=thumb" - - results.append( - { - 'url': 'https://twitter.com/i/web/status/' + tweet['id_str'], - 'title': (text[:40] + '...') if len(text) > 40 else text, - 'content': text[display[0] : display[1]], - 'img_src': img_src, - 'publishedDate': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y'), - } - ) - - for user in json_res['users'].values(): - results.append( - { - 'title': user['name'], - 'content': user['description'], - 'url': 'https://twitter.com/' + user['screen_name'], - 'img_src': user['profile_image_url_https'], - } - ) - - return results diff --git a/apps/searxng/searx/engines/unsplash.py b/apps/searxng/searx/engines/unsplash.py deleted file mode 100755 index 1967fef..0000000 --- a/apps/searxng/searx/engines/unsplash.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Unsplash - -""" - -from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl -from json import loads - -# about -about = { - "website": 'https://unsplash.com', - "wikidata_id": 'Q28233552', - "official_api_documentation": 'https://unsplash.com/developers', - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -base_url = 'https://unsplash.com/' -search_url = base_url + 'napi/search/photos?' 
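# Illustrative sketch: the title/content mapping the deleted twitter.py
# response() above applies to a single tweet object. The tweet dict is a
# made-up minimal example of the adaptive.json structure.
tweet = {
    'id_str': '1234567890',
    'full_text': '@someone This is an example tweet that is longer than forty characters in total.',
    'display_text_range': [9, 81],
}
text = tweet['full_text']
display = tweet['display_text_range']
print({
    'url': 'https://twitter.com/i/web/status/' + tweet['id_str'],
    'title': (text[:40] + '...') if len(text) > 40 else text,
    'content': text[display[0] : display[1]],
})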
-categories = ['images'] -page_size = 20 -paging = True - - -def clean_url(url): - parsed = urlparse(url) - query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']] - - return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment)) - - -def request(query, params): - params['url'] = search_url + urlencode({'query': query, 'page': params['pageno'], 'per_page': page_size}) - logger.debug("query_url --> %s", params['url']) - return params - - -def response(resp): - results = [] - json_data = loads(resp.text) - - if 'results' in json_data: - for result in json_data['results']: - results.append( - { - 'template': 'images.html', - 'url': clean_url(result['links']['html']), - 'thumbnail_src': clean_url(result['urls']['thumb']), - 'img_src': clean_url(result['urls']['raw']), - 'title': result.get('alt_description') or 'unknown', - 'content': result.get('description') or '', - } - ) - - return results diff --git a/apps/searxng/searx/engines/vimeo.py b/apps/searxng/searx/engines/vimeo.py deleted file mode 100755 index 2449345..0000000 --- a/apps/searxng/searx/engines/vimeo.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Wikipedia (Web -""" - -from urllib.parse import urlencode -from json import loads -from dateutil import parser - -# about -about = { - "website": 'https://vimeo.com/', - "wikidata_id": 'Q156376', - "official_api_documentation": 'http://developer.vimeo.com/api', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['videos'] -paging = True - -# search-url -base_url = 'https://vimeo.com/' -search_url = base_url + '/search/page:{pageno}?{query}' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(pageno=params['pageno'], query=urlencode({'q': query})) - - return params - - -# get response from search-request -def response(resp): - results = [] - data_start_pos = resp.text.find('{"filtered"') - data_end_pos = resp.text.find(';\n', data_start_pos + 1) - data = loads(resp.text[data_start_pos:data_end_pos]) - - # parse results - for result in data['filtered']['data']: - result = result[result['type']] - videoid = result['uri'].split('/')[-1] - url = base_url + videoid - title = result['name'] - thumbnail = result['pictures']['sizes'][-1]['link'] - publishedDate = parser.parse(result['created_time']) - - # append result - results.append( - { - 'url': url, - 'title': title, - 'content': '', - 'template': 'videos.html', - 'publishedDate': publishedDate, - 'iframe_src': "https://player.vimeo.com/video/" + videoid, - 'thumbnail': thumbnail, - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/wikidata.py b/apps/searxng/searx/engines/wikidata.py deleted file mode 100755 index 34d4081..0000000 --- a/apps/searxng/searx/engines/wikidata.py +++ /dev/null @@ -1,783 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This module implements the Wikidata engine. Some implementations are shared -from :ref:`wikipedia engine`. 
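# Illustrative sketch: what the deleted unsplash.py clean_url() helper above
# does to a result URL — it drops the 'ixid' and 's' tracking parameters.
# The input URL is a made-up example.
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl

def clean_url(url):
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))

print(clean_url('https://images.unsplash.com/photo-1?ixid=abc123&s=def456&w=400'))
# -> https://images.unsplash.com/photo-1?w=400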
- -""" -# pylint: disable=missing-class-docstring - -from typing import TYPE_CHECKING -from hashlib import md5 -from urllib.parse import urlencode, unquote -from json import loads - -from dateutil.parser import isoparse -from babel.dates import format_datetime, format_date, format_time, get_datetime_format - -from searx.data import WIKIDATA_UNITS -from searx.network import post, get -from searx.utils import searx_useragent, get_string_replaces_function -from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom -from searx.engines.wikipedia import ( - fetch_wikimedia_traits, - get_wiki_params, -) -from searx.enginelib.traits import EngineTraits - -if TYPE_CHECKING: - import logging - - logger: logging.Logger - -traits: EngineTraits - -# about -about = { - "website": 'https://wikidata.org/', - "wikidata_id": 'Q2013', - "official_api_documentation": 'https://query.wikidata.org/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# SPARQL -SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql' -SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain' -WIKIDATA_PROPERTIES = { - 'P434': 'MusicBrainz', - 'P435': 'MusicBrainz', - 'P436': 'MusicBrainz', - 'P966': 'MusicBrainz', - 'P345': 'IMDb', - 'P2397': 'YouTube', - 'P1651': 'YouTube', - 'P2002': 'Twitter', - 'P2013': 'Facebook', - 'P2003': 'Instagram', -} - -# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI -# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE -# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates -# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model -# optimization: -# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization -# * https://github.com/blazegraph/database/wiki/QueryHints -QUERY_TEMPLATE = """ -SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT% -WHERE -{ - SERVICE wikibase:mwapi { - bd:serviceParam wikibase:endpoint "www.wikidata.org"; - wikibase:api "EntitySearch"; - wikibase:limit 1; - mwapi:search "%QUERY%"; - mwapi:language "%LANGUAGE%". - ?item wikibase:apiOutputItem mwapi:item. - } - hint:Prior hint:runFirst "true". - - %WHERE% - - SERVICE wikibase:label { - bd:serviceParam wikibase:language "%LANGUAGE%,en". - ?item rdfs:label ?itemLabel . - ?item schema:description ?itemDescription . - %WIKIBASE_LABELS% - } - -} -GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY% -""" - -# Get the calendar names and the property names -QUERY_PROPERTY_NAMES = """ -SELECT ?item ?name -WHERE { - { - SELECT ?item - WHERE { ?item wdt:P279* wd:Q12132 } - } UNION { - VALUES ?item { %ATTRIBUTES% } - } - OPTIONAL { ?item rdfs:label ?name. 
} -} -""" - -# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata) -# hard coded here to avoid to an additional SPARQL request when the server starts -DUMMY_ENTITY_URLS = set( - "http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402") -) - - -# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1 -# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html -sparql_string_escape = get_string_replaces_function( - # fmt: off - { - '\t': '\\\t', - '\n': '\\\n', - '\r': '\\\r', - '\b': '\\\b', - '\f': '\\\f', - '\"': '\\\"', - '\'': '\\\'', - '\\': '\\\\' - } - # fmt: on -) - -replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) - - -def get_headers(): - # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits - return {'Accept': 'application/sparql-results+json', 'User-Agent': searx_useragent()} - - -def get_label_for_entity(entity_id, language): - name = WIKIDATA_PROPERTIES.get(entity_id) - if name is None: - name = WIKIDATA_PROPERTIES.get((entity_id, language)) - if name is None: - name = WIKIDATA_PROPERTIES.get((entity_id, language.split('-')[0])) - if name is None: - name = WIKIDATA_PROPERTIES.get((entity_id, 'en')) - if name is None: - name = entity_id - return name - - -def send_wikidata_query(query, method='GET'): - if method == 'GET': - # query will be cached by wikidata - http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers()) - else: - # query won't be cached by wikidata - http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers()) - if http_response.status_code != 200: - logger.debug('SPARQL endpoint error %s', http_response.content.decode()) - logger.debug('request time %s', str(http_response.elapsed)) - http_response.raise_for_status() - return loads(http_response.content.decode()) - - -def request(query, params): - - eng_tag, _wiki_netloc = get_wiki_params(params['searxng_locale'], traits) - query, attributes = get_query(query, eng_tag) - logger.debug("request --> language %s // len(attributes): %s", eng_tag, len(attributes)) - - params['method'] = 'POST' - params['url'] = SPARQL_ENDPOINT_URL - params['data'] = {'query': query} - params['headers'] = get_headers() - params['language'] = eng_tag - params['attributes'] = attributes - - return params - - -def response(resp): - - results = [] - jsonresponse = loads(resp.content.decode()) - - language = resp.search_params['language'] - attributes = resp.search_params['attributes'] - logger.debug("request --> language %s // len(attributes): %s", language, len(attributes)) - - seen_entities = set() - for result in jsonresponse.get('results', {}).get('bindings', []): - attribute_result = {key: value['value'] for key, value in result.items()} - entity_url = attribute_result['item'] - if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS: - seen_entities.add(entity_url) - results += get_results(attribute_result, attributes, language) - else: - logger.debug('The SPARQL request returns duplicate entities: %s', str(attribute_result)) - - return results - - -_IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/" -_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/thumb/" - - -def get_thumbnail(img_src): - """Get Thumbnail image from wikimedia commons - - Images from commons.wikimedia.org are (HTTP) redirected to - upload.wikimedia.org. 
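# Illustrative sketch: the GET variant of send_wikidata_query() above, using
# only the standard library instead of searx.network. The SPARQL query is a
# trivial example (three instances of "house cat") and the User-Agent string
# is a placeholder.
import json
from urllib.parse import urlencode
from urllib.request import Request, urlopen

SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql'
query = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q146 } LIMIT 3'

req = Request(
    SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}),
    headers={'Accept': 'application/sparql-results+json', 'User-Agent': 'example-agent/1.0'},
)
with urlopen(req) as resp:
    data = json.loads(resp.read().decode())
print([row['item']['value'] for row in data['results']['bindings']])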
The redirected URL can be calculated by this - function. - - - https://stackoverflow.com/a/33691240 - - """ - logger.debug('get_thumbnail(): %s', img_src) - if not img_src is None and _IMG_SRC_DEFAULT_URL_PREFIX in img_src.split()[0]: - img_src_name = unquote(img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[0].replace("%20", "_")) - img_src_name_first = img_src_name - img_src_name_second = img_src_name - - if ".svg" in img_src_name.split()[0]: - img_src_name_second = img_src_name + ".png" - - img_src_size = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[1] - img_src_size = img_src_size[img_src_size.index("=") + 1 : img_src_size.index("&")] - img_src_name_md5 = md5(img_src_name.encode("utf-8")).hexdigest() - img_src = ( - _IMG_SRC_NEW_URL_PREFIX - + img_src_name_md5[0] - + "/" - + img_src_name_md5[0:2] - + "/" - + img_src_name_first - + "/" - + img_src_size - + "px-" - + img_src_name_second - ) - logger.debug('get_thumbnail() redirected: %s', img_src) - - return img_src - - -def get_results(attribute_result, attributes, language): - # pylint: disable=too-many-branches - results = [] - infobox_title = attribute_result.get('itemLabel') - infobox_id = attribute_result['item'] - infobox_id_lang = None - infobox_urls = [] - infobox_attributes = [] - infobox_content = attribute_result.get('itemDescription', []) - img_src = None - img_src_priority = 0 - - for attribute in attributes: - value = attribute.get_str(attribute_result, language) - if value is not None and value != '': - attribute_type = type(attribute) - - if attribute_type in (WDURLAttribute, WDArticle): - # get_select() method : there is group_concat(distinct ...;separator=", ") - # split the value here - for url in value.split(', '): - infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs}) - # "normal" results (not infobox) include official website and Wikipedia links. - if attribute.kwargs.get('official') or attribute_type == WDArticle: - results.append({'title': infobox_title, 'url': url, "content": infobox_content}) - # update the infobox_id with the wikipedia URL - # first the local wikipedia URL, and as fallback the english wikipedia URL - if attribute_type == WDArticle and ( - (attribute.language == 'en' and infobox_id_lang is None) or attribute.language != 'en' - ): - infobox_id_lang = attribute.language - infobox_id = url - elif attribute_type == WDImageAttribute: - # this attribute is an image. - # replace the current image only the priority is lower - # (the infobox contain only one image). 
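# Illustrative sketch: the commons.wikimedia.org -> upload.wikimedia.org
# rewrite performed by get_thumbnail() above, worked through on a made-up
# file name with a "?width=500&height=400" size hint.
from hashlib import md5
from urllib.parse import unquote

_IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/"
_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/thumb/"

img_src = _IMG_SRC_DEFAULT_URL_PREFIX + "Example%20portrait.jpg?width=500&height=400"

name = unquote(img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[0].replace("%20", "_"))
size = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[1]
size = size[size.index("=") + 1 : size.index("&")]          # -> "500"
digest = md5(name.encode("utf-8")).hexdigest()
print(_IMG_SRC_NEW_URL_PREFIX + digest[0] + "/" + digest[0:2] + "/" + name + "/" + size + "px-" + name)
# e.g. .../thumb/<h>/<hh>/Example_portrait.jpg/500px-Example_portrait.jpg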
- if attribute.priority > img_src_priority: - img_src = get_thumbnail(value) - img_src_priority = attribute.priority - elif attribute_type == WDGeoAttribute: - # geocoordinate link - # use the area to get the OSM zoom - # Note: ignre the unit (must be km² otherwise the calculation is wrong) - # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount - area = attribute_result.get('P2046') - osm_zoom = area_to_osm_zoom(area) if area else 19 - url = attribute.get_geo_url(attribute_result, osm_zoom=osm_zoom) - if url: - infobox_urls.append({'title': attribute.get_label(language), 'url': url, 'entity': attribute.name}) - else: - infobox_attributes.append( - {'label': attribute.get_label(language), 'value': value, 'entity': attribute.name} - ) - - if infobox_id: - infobox_id = replace_http_by_https(infobox_id) - - # add the wikidata URL at the end - infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']}) - - if img_src is None and len(infobox_attributes) == 0 and len(infobox_urls) == 1 and len(infobox_content) == 0: - results.append({'url': infobox_urls[0]['url'], 'title': infobox_title, 'content': infobox_content}) - else: - results.append( - { - 'infobox': infobox_title, - 'id': infobox_id, - 'content': infobox_content, - 'img_src': img_src, - 'urls': infobox_urls, - 'attributes': infobox_attributes, - } - ) - return results - - -def get_query(query, language): - attributes = get_attributes(language) - select = [a.get_select() for a in attributes] - where = list(filter(lambda s: len(s) > 0, [a.get_where() for a in attributes])) - wikibase_label = list(filter(lambda s: len(s) > 0, [a.get_wikibase_label() for a in attributes])) - group_by = list(filter(lambda s: len(s) > 0, [a.get_group_by() for a in attributes])) - query = ( - QUERY_TEMPLATE.replace('%QUERY%', sparql_string_escape(query)) - .replace('%SELECT%', ' '.join(select)) - .replace('%WHERE%', '\n '.join(where)) - .replace('%WIKIBASE_LABELS%', '\n '.join(wikibase_label)) - .replace('%GROUP_BY%', ' '.join(group_by)) - .replace('%LANGUAGE%', language) - ) - return query, attributes - - -def get_attributes(language): - # pylint: disable=too-many-statements - attributes = [] - - def add_value(name): - attributes.append(WDAttribute(name)) - - def add_amount(name): - attributes.append(WDAmountAttribute(name)) - - def add_label(name): - attributes.append(WDLabelAttribute(name)) - - def add_url(name, url_id=None, **kwargs): - attributes.append(WDURLAttribute(name, url_id, kwargs)) - - def add_image(name, url_id=None, priority=1): - attributes.append(WDImageAttribute(name, url_id, priority)) - - def add_date(name): - attributes.append(WDDateAttribute(name)) - - # Dates - for p in [ - 'P571', # inception date - 'P576', # dissolution date - 'P580', # start date - 'P582', # end date - 'P569', # date of birth - 'P570', # date of death - 'P619', # date of spacecraft launch - 'P620', - ]: # date of spacecraft landing - add_date(p) - - for p in [ - 'P27', # country of citizenship - 'P495', # country of origin - 'P17', # country - 'P159', - ]: # headquarters location - add_label(p) - - # Places - for p in [ - 'P36', # capital - 'P35', # head of state - 'P6', # head of government - 'P122', # basic form of government - 'P37', - ]: # official language - add_label(p) - - add_value('P1082') # population - add_amount('P2046') # area - add_amount('P281') # postal code - add_label('P38') # currency - add_amount('P2048') # height (building) - - # Media - for p in [ - 'P400', # platform (videogames, computing) - 'P50', # author - 
'P170', # creator - 'P57', # director - 'P175', # performer - 'P178', # developer - 'P162', # producer - 'P176', # manufacturer - 'P58', # screenwriter - 'P272', # production company - 'P264', # record label - 'P123', # publisher - 'P449', # original network - 'P750', # distributed by - 'P86', - ]: # composer - add_label(p) - - add_date('P577') # publication date - add_label('P136') # genre (music, film, artistic...) - add_label('P364') # original language - add_value('P212') # ISBN-13 - add_value('P957') # ISBN-10 - add_label('P275') # copyright license - add_label('P277') # programming language - add_value('P348') # version - add_label('P840') # narrative location - - # Languages - add_value('P1098') # number of speakers - add_label('P282') # writing system - add_label('P1018') # language regulatory body - add_value('P218') # language code (ISO 639-1) - - # Other - add_label('P169') # ceo - add_label('P112') # founded by - add_label('P1454') # legal form (company, organization) - add_label('P137') # operator (service, facility, ...) - add_label('P1029') # crew members (tripulation) - add_label('P225') # taxon name - add_value('P274') # chemical formula - add_label('P1346') # winner (sports, contests, ...) - add_value('P1120') # number of deaths - add_value('P498') # currency code (ISO 4217) - - # URL - add_url('P856', official=True) # official website - attributes.append(WDArticle(language)) # wikipedia (user language) - if not language.startswith('en'): - attributes.append(WDArticle('en')) # wikipedia (english) - - add_url('P1324') # source code repository - add_url('P1581') # blog - add_url('P434', url_id='musicbrainz_artist') - add_url('P435', url_id='musicbrainz_work') - add_url('P436', url_id='musicbrainz_release_group') - add_url('P966', url_id='musicbrainz_label') - add_url('P345', url_id='imdb_id') - add_url('P2397', url_id='youtube_channel') - add_url('P1651', url_id='youtube_video') - add_url('P2002', url_id='twitter_profile') - add_url('P2013', url_id='facebook_profile') - add_url('P2003', url_id='instagram_profile') - - # Map - attributes.append(WDGeoAttribute('P625')) - - # Image - add_image('P15', priority=1, url_id='wikimedia_image') # route map - add_image('P242', priority=2, url_id='wikimedia_image') # locator map - add_image('P154', priority=3, url_id='wikimedia_image') # logo - add_image('P18', priority=4, url_id='wikimedia_image') # image - add_image('P41', priority=5, url_id='wikimedia_image') # flag - add_image('P2716', priority=6, url_id='wikimedia_image') # collage - add_image('P2910', priority=7, url_id='wikimedia_image') # icon - - return attributes - - -class WDAttribute: - __slots__ = ('name',) - - def __init__(self, name): - self.name = name - - def get_select(self): - return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name) - - def get_label(self, language): - return get_label_for_entity(self.name, language) - - def get_where(self): - return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) - - def get_wikibase_label(self): - return "" - - def get_group_by(self): - return "" - - def get_str(self, result, language): # pylint: disable=unused-argument - return result.get(self.name + 's') - - def __repr__(self): - return '<' + str(type(self).__name__) + ':' + self.name + '>' - - -class WDAmountAttribute(WDAttribute): - def get_select(self): - return '?{name} ?{name}Unit'.replace('{name}', self.name) - - def get_where(self): - return """ OPTIONAL { ?item p:{name} ?{name}Node . 
- ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} . - OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace( - '{name}', self.name - ) - - def get_group_by(self): - return self.get_select() - - def get_str(self, result, language): - value = result.get(self.name) - unit = result.get(self.name + "Unit") - if unit is not None: - unit = unit.replace('http://www.wikidata.org/entity/', '') - return value + " " + get_label_for_entity(unit, language) - return value - - -class WDArticle(WDAttribute): - - __slots__ = 'language', 'kwargs' - - def __init__(self, language, kwargs=None): - super().__init__('wikipedia') - self.language = language - self.kwargs = kwargs or {} - - def get_label(self, language): - # language parameter is ignored - return "Wikipedia ({language})".replace('{language}', self.language) - - def get_select(self): - return "?article{language} ?articleName{language}".replace('{language}', self.language) - - def get_where(self): - return """OPTIONAL { ?article{language} schema:about ?item ; - schema:inLanguage "{language}" ; - schema:isPartOf ; - schema:name ?articleName{language} . }""".replace( - '{language}', self.language - ) - - def get_group_by(self): - return self.get_select() - - def get_str(self, result, language): - key = 'article{language}'.replace('{language}', self.language) - return result.get(key) - - -class WDLabelAttribute(WDAttribute): - def get_select(self): - return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace('{name}', self.name) - - def get_where(self): - return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) - - def get_wikibase_label(self): - return "?{name} rdfs:label ?{name}Label .".replace('{name}', self.name) - - def get_str(self, result, language): - return result.get(self.name + 'Labels') - - -class WDURLAttribute(WDAttribute): - - HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/' - - __slots__ = 'url_id', 'kwargs' - - def __init__(self, name, url_id=None, kwargs=None): - super().__init__(name) - self.url_id = url_id - self.kwargs = kwargs - - def get_str(self, result, language): - value = result.get(self.name + 's') - if self.url_id and value is not None and value != '': - value = value.split(',')[0] - url_id = self.url_id - if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE): - value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE) :] - url_id = 'wikimedia_image' - return get_external_url(url_id, value) - return value - - -class WDGeoAttribute(WDAttribute): - def get_label(self, language): - return "OpenStreetMap" - - def get_select(self): - return "?{name}Lat ?{name}Long".replace('{name}', self.name) - - def get_where(self): - return """OPTIONAL { ?item p:{name}/psv:{name} [ - wikibase:geoLatitude ?{name}Lat ; - wikibase:geoLongitude ?{name}Long ] }""".replace( - '{name}', self.name - ) - - def get_group_by(self): - return self.get_select() - - def get_str(self, result, language): - latitude = result.get(self.name + 'Lat') - longitude = result.get(self.name + 'Long') - if latitude and longitude: - return latitude + ' ' + longitude - return None - - def get_geo_url(self, result, osm_zoom=19): - latitude = result.get(self.name + 'Lat') - longitude = result.get(self.name + 'Long') - if latitude and longitude: - return get_earth_coordinates_url(latitude, longitude, osm_zoom) - return None - - -class WDImageAttribute(WDURLAttribute): - - __slots__ = ('priority',) - - def __init__(self, name, url_id=None, priority=100): - 
super().__init__(name, url_id) - self.priority = priority - - -class WDDateAttribute(WDAttribute): - def get_select(self): - return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name) - - def get_where(self): - # To remove duplicate, add - # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) } - # this filter is too slow, so the response function ignore duplicate results - # (see the seen_entities variable) - return """OPTIONAL { ?item p:{name}/psv:{name} [ - wikibase:timeValue ?{name} ; - wikibase:timePrecision ?{name}timePrecision ; - wikibase:timeTimezone ?{name}timeZone ; - wikibase:timeCalendarModel ?{name}timeCalendar ] . } - hint:Prior hint:rangeSafe true;""".replace( - '{name}', self.name - ) - - def get_group_by(self): - return self.get_select() - - def format_8(self, value, locale): # pylint: disable=unused-argument - # precision: less than a year - return value - - def format_9(self, value, locale): - year = int(value) - # precision: year - if year < 1584: - if year < 0: - return str(year - 1) - return str(year) - timestamp = isoparse(value) - return format_date(timestamp, format='yyyy', locale=locale) - - def format_10(self, value, locale): - # precision: month - timestamp = isoparse(value) - return format_date(timestamp, format='MMMM y', locale=locale) - - def format_11(self, value, locale): - # precision: day - timestamp = isoparse(value) - return format_date(timestamp, format='full', locale=locale) - - def format_13(self, value, locale): - timestamp = isoparse(value) - # precision: minute - return ( - get_datetime_format(format, locale=locale) - .replace("'", "") - .replace('{0}', format_time(timestamp, 'full', tzinfo=None, locale=locale)) - .replace('{1}', format_date(timestamp, 'short', locale=locale)) - ) - - def format_14(self, value, locale): - # precision: second. 
- return format_datetime(isoparse(value), format='full', locale=locale) - - DATE_FORMAT = { - '0': ('format_8', 1000000000), - '1': ('format_8', 100000000), - '2': ('format_8', 10000000), - '3': ('format_8', 1000000), - '4': ('format_8', 100000), - '5': ('format_8', 10000), - '6': ('format_8', 1000), - '7': ('format_8', 100), - '8': ('format_8', 10), - '9': ('format_9', 1), # year - '10': ('format_10', 1), # month - '11': ('format_11', 0), # day - '12': ('format_13', 0), # hour (not supported by babel, display minute) - '13': ('format_13', 0), # minute - '14': ('format_14', 0), # second - } - - def get_str(self, result, language): - value = result.get(self.name) - if value == '' or value is None: - return None - precision = result.get(self.name + 'timePrecision') - date_format = WDDateAttribute.DATE_FORMAT.get(precision) - if date_format is not None: - format_method = getattr(self, date_format[0]) - precision = date_format[1] - try: - if precision >= 1: - t = value.split('-') - if value.startswith('-'): - value = '-' + t[1] - else: - value = t[0] - return format_method(value, language) - except Exception: # pylint: disable=broad-except - return value - return value - - -def debug_explain_wikidata_query(query, method='GET'): - if method == 'GET': - http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers()) - else: - http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers()) - http_response.raise_for_status() - return http_response.content - - -def init(engine_settings=None): # pylint: disable=unused-argument - # WIKIDATA_PROPERTIES : add unit symbols - WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS) - - # WIKIDATA_PROPERTIES : add property labels - wikidata_property_names = [] - for attribute in get_attributes('en'): - if type(attribute) in (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute): - if attribute.name not in WIKIDATA_PROPERTIES: - wikidata_property_names.append("wd:" + attribute.name) - query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names)) - jsonresponse = send_wikidata_query(query) - for result in jsonresponse.get('results', {}).get('bindings', {}): - name = result['name']['value'] - lang = result['name']['xml:lang'] - entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '') - WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize() - - -def fetch_traits(engine_traits: EngineTraits): - """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits - ` and removes - - - ``traits.custom['wiki_netloc']``: wikidata does not have net-locations for - the languages and the list of all - - - ``traits.custom['WIKIPEDIA_LANGUAGES']``: not used in the wikipedia engine - - """ - - fetch_wikimedia_traits(engine_traits) - engine_traits.custom['wiki_netloc'] = {} - engine_traits.custom['WIKIPEDIA_LANGUAGES'] = [] diff --git a/apps/searxng/searx/engines/wikipedia.py b/apps/searxng/searx/engines/wikipedia.py deleted file mode 100755 index b4b7020..0000000 --- a/apps/searxng/searx/engines/wikipedia.py +++ /dev/null @@ -1,317 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""This module implements the Wikipedia engine. Some of this implementations -are shared by other engines: - -- :ref:`wikidata engine` - -The list of supported languages is :py:obj:`fetched ` from -the article linked by :py:obj:`list_of_wikipedias`. 
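# Illustrative sketch: the precision-dependent formatting used by
# WDDateAttribute above, shown directly with babel (already a SearXNG
# dependency). The timestamp is a made-up example value.
from babel.dates import format_date
from dateutil.parser import isoparse

value = '2012-06-30T00:00:00Z'
print(format_date(isoparse(value), format='yyyy', locale='en'))    # precision  9 (year)  -> 2012
print(format_date(isoparse(value), format='MMMM y', locale='en'))  # precision 10 (month) -> June 2012
print(format_date(isoparse(value), format='full', locale='en'))    # precision 11 (day)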
- -Unlike traditional search engines, wikipedia does not support one Wikipedia for -all languages, but there is one Wikipedia for each supported language. Some of -these Wikipedias have a LanguageConverter_ enabled -(:py:obj:`rest_v1_summary_url`). - -A LanguageConverter_ (LC) is a system based on language variants that -automatically converts the content of a page into a different variant. A variant -is mostly the same language in a different script. - -- `Wikipedias in multiple writing systems`_ -- `Automatic conversion between traditional and simplified Chinese characters`_ - -PR-2554_: - The Wikipedia link returned by the API is still the same in all cases - (`https://zh.wikipedia.org/wiki/出租車`_) but if your browser's - ``Accept-Language`` is set to any of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK`` - or .. Wikipedia's LC automatically returns the desired script in their - web-page. - - - You can test the API here: https://reqbin.com/gesg2kvx - -.. _https://zh.wikipedia.org/wiki/出租車: - https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A - -To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses -:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants' in the -:py:obj:`fetch_wikimedia_traits` function. - -To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese -options: - -- ``!wp 出租車 :zh`` should show 出租車 -- ``!wp 出租車 :zh-CN`` should show 出租车 -- ``!wp 出租車 :zh-TW`` should show 計程車 -- ``!wp 出租車 :zh-HK`` should show 的士 -- ``!wp 出租車 :zh-SG`` should show 德士 - -.. _LanguageConverter: - https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter -.. _Wikipedias in multiple writing systems: - https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems -.. _Automatic conversion between traditional and simplified Chinese characters: - https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters -.. _PR-2554: https://github.com/searx/searx/pull/2554 - -""" - -import urllib.parse -import babel - -from lxml import html - -from searx import utils -from searx import network as _network -from searx import locales -from searx.enginelib.traits import EngineTraits - -traits: EngineTraits - -# about -about = { - "website": 'https://www.wikipedia.org/', - "wikidata_id": 'Q52', - "official_api_documentation": 'https://en.wikipedia.org/api/', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -send_accept_language_header = True -"""The HTTP ``Accept-Language`` header is needed for wikis where -LanguageConverter_ is enabled.""" - -list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' -"""`List of all wikipedias `_ -""" - -wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth' -"""The *editing depth* of Wikipedia is one of several possible rough indicators -of the encyclopedia's collaborative quality, showing how frequently its articles -are updated. The measurement of depth was introduced after some limitations of -the classic measurement of article count were realized. -""" - -rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}' -""" -`wikipedia rest_v1 summary API`_: - The summary response includes an extract of the first paragraph of the page in - plain text and HTML as well as the type of page. This is useful for page - previews (fka. Hovercards, aka. Popups) on the web and link previews in the - apps. 
- -HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`): - The desired language variant code for wikis where LanguageConverter_ is - enabled. - -.. _wikipedia rest_v1 summary API: - https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_ - -""" - -wiki_lc_locale_variants = { - "zh": ( - "zh-CN", - "zh-HK", - "zh-MO", - "zh-MY", - "zh-SG", - "zh-TW", - ), - "zh-classical": ("zh-classical",), -} -"""Mapping rule of the LanguageConverter_ to map a language and its variants to -a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC -Chinese`_. - -.. _LC Chinese: - https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese -""" - -wikipedia_script_variants = { - "zh": ( - "zh_Hant", - "zh_Hans", - ) -} - - -def get_wiki_params(sxng_locale, eng_traits): - """Returns the Wikipedia language tag and the netloc that fits to the - ``sxng_locale``. To support LanguageConverter_ this function rates a locale - (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`). - - """ - eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en')) - wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org') - return eng_tag, wiki_netloc - - -def request(query, params): - """Assemble a request (`wikipedia rest_v1 summary API`_).""" - if query.islower(): - query = query.title() - - _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits) - title = urllib.parse.quote(query) - params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title) - - params['raise_for_httperror'] = False - params['soft_max_redirects'] = 2 - - return params - - -# get response from search-request -def response(resp): - - results = [] - if resp.status_code == 404: - return [] - if resp.status_code == 400: - try: - api_result = resp.json() - except Exception: # pylint: disable=broad-except - pass - else: - if ( - api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request' - and api_result['detail'] == 'title-invalid-characters' - ): - return [] - - _network.raise_for_httperror(resp) - - api_result = resp.json() - title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title')) - wikipedia_link = api_result['content_urls']['desktop']['page'] - results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')}) - - if api_result.get('type') == 'standard': - results.append( - { - 'infobox': title, - 'id': wikipedia_link, - 'content': api_result.get('extract', ''), - 'img_src': api_result.get('thumbnail', {}).get('source'), - 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}], - } - ) - - return results - - -# Nonstandard language codes -# -# These Wikipedias use language codes that do not conform to the ISO 639 -# standard (which is how wiki subdomains are chosen nowadays). 
- -lang_map = locales.LOCALE_BEST_MATCH.copy() -lang_map.update( - { - 'be-tarask': 'bel', - 'ak': 'aka', - 'als': 'gsw', - 'bat-smg': 'sgs', - 'cbk-zam': 'cbk', - 'fiu-vro': 'vro', - 'map-bms': 'map', - 'no': 'nb-NO', - 'nrm': 'nrf', - 'roa-rup': 'rup', - 'nds-nl': 'nds', - #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple) - 'zh-min-nan': 'nan', - 'zh-yue': 'yue', - 'an': 'arg', - } -) - - -def fetch_traits(engine_traits: EngineTraits): - fetch_wikimedia_traits(engine_traits) - print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES'])) - - -def fetch_wikimedia_traits(engine_traits: EngineTraits): - """Fetch languages from Wikipedia. Not all languages from the - :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those - known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal - :py:obj:`editing depth `. - - The location of the Wikipedia address of a language is mapped in a - :py:obj:`custom field ` - (``wiki_netloc``). Here is a reduced example: - - .. code:: python - - traits.custom['wiki_netloc'] = { - "en": "en.wikipedia.org", - .. - "gsw": "als.wikipedia.org", - .. - "zh": "zh.wikipedia.org", - "zh-classical": "zh-classical.wikipedia.org" - } - """ - # pylint: disable=too-many-branches - engine_traits.custom['wiki_netloc'] = {} - engine_traits.custom['WIKIPEDIA_LANGUAGES'] = [] - - # insert alias to map from a script or region to a wikipedia variant - - for eng_tag, sxng_tag_list in wikipedia_script_variants.items(): - for sxng_tag in sxng_tag_list: - engine_traits.languages[sxng_tag] = eng_tag - for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items(): - for sxng_tag in sxng_tag_list: - engine_traits.regions[sxng_tag] = eng_tag - - resp = _network.get(list_of_wikipedias) - if not resp.ok: - print("ERROR: response from Wikipedia is not OK.") - - dom = html.fromstring(resp.text) - for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'): - - cols = row.xpath('./td') - if not cols: - continue - cols = [c.text_content().strip() for c in cols] - - depth = float(cols[11].replace('-', '0').replace(',', '')) - articles = int(cols[4].replace(',', '').replace(',', '')) - - eng_tag = cols[3] - wiki_url = row.xpath('./td[4]/a/@href')[0] - wiki_url = urllib.parse.urlparse(wiki_url) - - try: - sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-')) - except babel.UnknownLocaleError: - # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag)) - continue - finally: - engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag) - - if sxng_tag not in locales.LOCALE_NAMES: - - if articles < 10000: - # exclude languages with too few articles - continue - - if int(depth) < 20: - # Rough indicator of a Wikipedia’s quality, showing how - # frequently its articles are updated. 
- continue - - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - - engine_traits.languages[sxng_tag] = eng_tag - engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc - - engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort() diff --git a/apps/searxng/searx/engines/wolframalpha_api.py b/apps/searxng/searx/engines/wolframalpha_api.py deleted file mode 100755 index 6a2423b..0000000 --- a/apps/searxng/searx/engines/wolframalpha_api.py +++ /dev/null @@ -1,140 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Wolfram|Alpha (Science) -""" - -from lxml import etree -from urllib.parse import urlencode - -# about -about = { - "website": 'https://www.wolframalpha.com', - "wikidata_id": 'Q207006', - "official_api_documentation": 'https://products.wolframalpha.com/api/', - "use_official_api": True, - "require_api_key": False, - "results": 'XML', -} - -# search-url -search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}' -site_url = 'https://www.wolframalpha.com/input/?{query}' -api_key = '' # defined in settings.yml - -# xpath variables -failure_xpath = '/queryresult[attribute::success="false"]' -input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext' -pods_xpath = '//pod' -subpods_xpath = './subpod' -pod_primary_xpath = './@primary' -pod_id_xpath = './@id' -pod_title_xpath = './@title' -plaintext_xpath = './plaintext' -image_xpath = './img' -img_src_xpath = './@src' -img_alt_xpath = './@alt' - -# pods to display as image in infobox -# this pods do return a plaintext, but they look better and are more useful as images -image_pods = {'VisualRepresentation', 'Illustration'} - - -# do search-request -def request(query, params): - params['url'] = search_url.format(query=urlencode({'input': query}), api_key=api_key) - params['headers']['Referer'] = site_url.format(query=urlencode({'i': query})) - - return params - - -# replace private user area characters to make text legible -def replace_pua_chars(text): - pua_chars = { - '\uf522': '\u2192', # right arrow - '\uf7b1': '\u2115', # set of natural numbers - '\uf7b4': '\u211a', # set of rational numbers - '\uf7b5': '\u211d', # set of real numbers - '\uf7bd': '\u2124', # set of integer numbers - '\uf74c': 'd', # differential - '\uf74d': '\u212f', # euler's number - '\uf74e': 'i', # imaginary number - '\uf7d9': '=', - } # equals sign - - for k, v in pua_chars.items(): - text = text.replace(k, v) - - return text - - -# get response from search-request -def response(resp): - results = [] - - search_results = etree.XML(resp.content) - - # return empty array if there are no results - if search_results.xpath(failure_xpath): - return [] - - try: - infobox_title = search_results.xpath(input_xpath)[0].text - except: - infobox_title = "" - - pods = search_results.xpath(pods_xpath) - result_chunks = [] - result_content = "" - for pod in pods: - pod_id = pod.xpath(pod_id_xpath)[0] - pod_title = pod.xpath(pod_title_xpath)[0] - pod_is_result = pod.xpath(pod_primary_xpath) - - subpods = pod.xpath(subpods_xpath) - if not subpods: - continue - - # Appends either a text or an image, depending on which one is more suitable - for subpod in subpods: - content = subpod.xpath(plaintext_xpath)[0].text - image = subpod.xpath(image_xpath) - - if content and pod_id not in image_pods: - - if pod_is_result or not result_content: - if pod_id != "Input": - result_content = "%s: %s" % (pod_title, content) - - # 
if no input pod was found, title is first plaintext pod - if not infobox_title: - infobox_title = content - - content = replace_pua_chars(content) - result_chunks.append({'label': pod_title, 'value': content}) - - elif image: - result_chunks.append( - { - 'label': pod_title, - 'image': {'src': image[0].xpath(img_src_xpath)[0], 'alt': image[0].xpath(img_alt_xpath)[0]}, - } - ) - - if not result_chunks: - return [] - - title = "Wolfram Alpha (%s)" % infobox_title - - # append infobox - results.append( - { - 'infobox': infobox_title, - 'attributes': result_chunks, - 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}], - } - ) - - # append link to site - results.append({'url': resp.request.headers['Referer'], 'title': title, 'content': result_content}) - - return results diff --git a/apps/searxng/searx/engines/wolframalpha_noapi.py b/apps/searxng/searx/engines/wolframalpha_noapi.py deleted file mode 100755 index bad2560..0000000 --- a/apps/searxng/searx/engines/wolframalpha_noapi.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Wolfram|Alpha (Science) -""" - -from json import loads -from time import time -from urllib.parse import urlencode - -from searx.network import get as http_get - -# about -about = { - "website": 'https://www.wolframalpha.com/', - "wikidata_id": 'Q207006', - "official_api_documentation": 'https://products.wolframalpha.com/api/', - "use_official_api": False, - "require_api_key": False, - "results": 'JSON', -} - -# search-url -url = 'https://www.wolframalpha.com/' - -search_url = ( - url + 'input/json.jsp' - '?async=false' - '&banners=raw' - '&debuggingdata=false' - '&format=image,plaintext,imagemap,minput,moutput' - '&formattimeout=2' - '&{query}' - '&output=JSON' - '&parsetimeout=2' - '&proxycode={token}' - '&scantimeout=0.5' - '&sponsorcategories=true' - '&statemethod=deploybutton' -) - -referer_url = url + 'input/?{query}' - -token = {'value': '', 'last_updated': None} - -# pods to display as image in infobox -# this pods do return a plaintext, but they look better and are more useful as images -image_pods = {'VisualRepresentation', 'Illustration', 'Symbol'} - - -# seems, wolframalpha resets its token in every hour -def obtain_token(): - update_time = time() - (time() % 3600) - try: - token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0) - token['value'] = loads(token_response.text)['code'] - token['last_updated'] = update_time - except: - pass - return token - - -def init(engine_settings=None): - obtain_token() - - -# do search-request -def request(query, params): - # obtain token if last update was more than an hour - if time() - (token['last_updated'] or 0) > 3600: - obtain_token() - params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) - params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query})) - - return params - - -# get response from search-request -def response(resp): - results = [] - - resp_json = loads(resp.text) - - if not resp_json['queryresult']['success']: - return [] - - # TODO handle resp_json['queryresult']['assumptions'] - result_chunks = [] - infobox_title = "" - result_content = "" - for pod in resp_json['queryresult']['pods']: - pod_id = pod.get('id', '') - pod_title = pod.get('title', '') - pod_is_result = pod.get('primary', None) - - if 'subpods' not in pod: - continue - - if pod_id == 'Input' or not infobox_title: - infobox_title = pod['subpods'][0]['plaintext'] - - 
for subpod in pod['subpods']: - if subpod['plaintext'] != '' and pod_id not in image_pods: - # append unless it's not an actual answer - if subpod['plaintext'] != '(requires interactivity)': - result_chunks.append({'label': pod_title, 'value': subpod['plaintext']}) - - if pod_is_result or not result_content: - if pod_id != "Input": - result_content = pod_title + ': ' + subpod['plaintext'] - - elif 'img' in subpod: - result_chunks.append({'label': pod_title, 'image': subpod['img']}) - - if not result_chunks: - return [] - - results.append( - { - 'infobox': infobox_title, - 'attributes': result_chunks, - 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}], - } - ) - - results.append( - { - 'url': resp.request.headers['Referer'], - 'title': 'Wolfram|Alpha (' + infobox_title + ')', - 'content': result_content, - } - ) - - return results diff --git a/apps/searxng/searx/engines/wordnik.py b/apps/searxng/searx/engines/wordnik.py deleted file mode 100755 index 21eaecc..0000000 --- a/apps/searxng/searx/engines/wordnik.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -"""Wordnik (general) - -""" - -from lxml.html import fromstring -from searx.utils import extract_text -from searx.network import raise_for_httperror - -# about -about = { - "website": 'https://www.wordnik.com', - "wikidata_id": 'Q8034401', - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -categories = ['general'] -paging = False - -URL = 'https://www.wordnik.com' -SEARCH_URL = URL + '/words/{query}' - - -def request(query, params): - params['url'] = SEARCH_URL.format(query=query) - logger.debug(f"query_url --> {params['url']}") - return params - - -def response(resp): - results = [] - - raise_for_httperror(resp) - dom = fromstring(resp.text) - word = extract_text(dom.xpath('//*[@id="headword"]/text()')) - - definitions = [] - for src in dom.xpath('//*[@id="define"]//h3[@class="source"]'): - src_text = extract_text(src).strip() - if src_text.startswith('from '): - src_text = src_text[5:] - - src_defs = [] - for def_item in src.xpath('following-sibling::ul[1]/li'): - def_abbr = extract_text(def_item.xpath('.//abbr')).strip() - def_text = extract_text(def_item).strip() - if def_abbr: - def_text = def_text[len(def_abbr) :].strip() - src_defs.append((def_abbr, def_text)) - - definitions.append((src_text, src_defs)) - - if not definitions: - return results - - infobox = '' - for src_text, src_defs in definitions: - infobox += f"{src_text}" - infobox += "
    " - for def_abbr, def_text in src_defs: - if def_abbr: - def_abbr += ": " - infobox += f"
{def_abbr} {def_text}"
        infobox += "
" - - results.append( - { - 'infobox': word, - 'content': infobox, - } - ) - - return results diff --git a/apps/searxng/searx/engines/wttr.py b/apps/searxng/searx/engines/wttr.py deleted file mode 100755 index 2eaee62..0000000 --- a/apps/searxng/searx/engines/wttr.py +++ /dev/null @@ -1,136 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""wttr.in (weather forecast service)""" - -from json import loads -from urllib.parse import quote -from flask_babel import gettext - -about = { - "website": "https://wttr.in", - "wikidata_id": "Q107586666", - "official_api_documentation": "https://github.com/chubin/wttr.in#json-output", - "use_official_api": True, - "require_api_key": False, - "results": "JSON", -} - -categories = ["weather"] - -url = "https://wttr.in/{query}?format=j1&lang={lang}" - - -def get_weather_condition_key(lang): - if lang == "en": - return "weatherDesc" - - return "lang_" + lang.lower() - - -def generate_day_table(day): - res = "" - - res += f"{gettext('Average temp.')}{day['avgtempC']}°C / {day['avgtempF']}°F" - res += f"{gettext('Min temp.')}{day['mintempC']}°C / {day['mintempF']}°F" - res += f"{gettext('Max temp.')}{day['maxtempC']}°C / {day['maxtempF']}°F" - res += f"{gettext('UV index')}{day['uvIndex']}" - res += f"{gettext('Sunrise')}{day['astronomy'][0]['sunrise']}" - res += f"{gettext('Sunset')}{day['astronomy'][0]['sunset']}" - - return res - - -def generate_condition_table(condition, lang, current=False): - res = "" - - if current: - key = "temp_" - else: - key = "temp" - - res += ( - f"{gettext('Condition')}" - f"{condition[get_weather_condition_key(lang)][0]['value']}" - ) - res += ( - f"{gettext('Temperature')}" - f"{condition[key+'C']}°C / {condition[key+'F']}°F" - ) - res += ( - f"{gettext('Feels like')}{condition['FeelsLikeC']}°C / {condition['FeelsLikeF']}°F" - ) - res += ( - f"{gettext('Wind')}{condition['winddir16Point']} — " - f"{condition['windspeedKmph']} km/h / {condition['windspeedMiles']} mph" - ) - res += ( - f"{gettext('Visibility')}{condition['visibility']} km / {condition['visibilityMiles']} mi" - ) - res += f"{gettext('Humidity')}{condition['humidity']}%" - - return res - - -def request(query, params): - if query.replace('/', '') in [":help", ":bash.function", ":translation"]: - return None - - if params["language"] == "all": - params["language"] = "en" - else: - params["language"] = params["language"].split("-")[0] - - params["url"] = url.format(query=quote(query), lang=params["language"]) - - params["raise_for_httperror"] = False - - return params - - -def response(resp): - results = [] - - if resp.status_code == 404: - return [] - - result = loads(resp.text) - - current = result["current_condition"][0] - location = result['nearest_area'][0] - - forecast_indices = {3: gettext('Morning'), 4: gettext('Noon'), 6: gettext('Evening'), 7: gettext('Night')} - - title = f"{location['areaName'][0]['value']}, {location['region'][0]['value']}" - - infobox = f"

{gettext('Current condition')}"

    infobox += generate_condition_table(current, resp.search_params['language'], True)

    for day in result["weather"]:
        infobox += f"{day['date']}"

        infobox += generate_day_table(day)

        for time in forecast_indices.items():
            infobox += f"{time[1]}"
            infobox += generate_condition_table(day['hourly'][time[0]], resp.search_params['language'])

        infobox += "
" - - results.append( - { - "infobox": title, - "content": infobox, - } - ) - - return results diff --git a/apps/searxng/searx/engines/www1x.py b/apps/searxng/searx/engines/www1x.py deleted file mode 100755 index a7ec06f..0000000 --- a/apps/searxng/searx/engines/www1x.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""1x (Images) - -""" - -from urllib.parse import urlencode, urljoin -from lxml import html, etree - -from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex - -# about -about = { - "website": 'https://1x.com/', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['images'] -paging = False - -# search-url -base_url = 'https://1x.com' -search_url = base_url + '/backend/search.php?{query}' -gallery_url = 'https://gallery.1x.com/' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query})) - - return params - - -# get response from search-request -def response(resp): - results = [] - xmldom = etree.fromstring(resp.content) - xmlsearchresult = eval_xpath_getindex(xmldom, '//data', 0) - dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div') - for link in eval_xpath_list(dom, '//a'): - url = urljoin(base_url, link.attrib.get('href')) - title = extract_text(link) - thumbnail_src = urljoin( - gallery_url, (eval_xpath_getindex(link, './/img', 0).attrib['src']).replace(base_url, '') - ) - # append result - results.append( - { - 'url': url, - 'title': title, - 'img_src': thumbnail_src, - 'content': '', - 'thumbnail_src': thumbnail_src, - 'template': 'images.html', - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/xpath.py b/apps/searxng/searx/engines/xpath.py deleted file mode 100755 index 51ddcda..0000000 --- a/apps/searxng/searx/engines/xpath.py +++ /dev/null @@ -1,311 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""The XPath engine is a *generic* engine with which it is possible to configure -engines in the settings. - -.. _XPath selector: https://quickref.me/xpath.html#xpath-selectors - -Configuration -============= - -Request: - -- :py:obj:`search_url` -- :py:obj:`lang_all` -- :py:obj:`soft_max_redirects` -- :py:obj:`cookies` -- :py:obj:`headers` - -Paging: - -- :py:obj:`paging` -- :py:obj:`page_size` -- :py:obj:`first_page_num` - -Time Range: - -- :py:obj:`time_range_support` -- :py:obj:`time_range_url` -- :py:obj:`time_range_map` - -Safe-Search: - -- :py:obj:`safe_search_support` -- :py:obj:`safe_search_map` - -Response: - -- :py:obj:`no_result_for_http_status` - -`XPath selector`_: - -- :py:obj:`results_xpath` -- :py:obj:`url_xpath` -- :py:obj:`title_xpath` -- :py:obj:`content_xpath` -- :py:obj:`thumbnail_xpath` -- :py:obj:`suggestion_xpath` - - -Example -======= - -Here is a simple example of a XPath engine configured in the :ref:`settings -engine` section, further read :ref:`engines-dev`. - -.. 
code:: yaml - - - name : bitbucket - engine : xpath - paging : True - search_url : https://bitbucket.org/repo/all/{pageno}?name={query} - url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href - title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"] - content_xpath : //article[@class="repo-summary"]/p - -Implementations -=============== - -""" - -from urllib.parse import urlencode - -from lxml import html -from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list -from searx.network import raise_for_httperror - -search_url = None -""" -Search URL of the engine. Example:: - - https://example.org/?search={query}&page={pageno}{time_range}{safe_search} - -Replacements are: - -``{query}``: - Search terms from user. - -``{pageno}``: - Page number if engine supports pagging :py:obj:`paging` - -``{lang}``: - ISO 639-1 language code (en, de, fr ..) - -``{time_range}``: - :py:obj:`URL parameter ` if engine :py:obj:`supports time - range `. The value for the parameter is taken from - :py:obj:`time_range_map`. - -``{safe_search}``: - Safe-search :py:obj:`URL parameter ` if engine - :py:obj:`supports safe-search `. The ``{safe_search}`` - replacement is taken from the :py:obj:`safes_search_map`. Filter results:: - - 0: none, 1: moderate, 2:strict - - If not supported, the URL parameter is an empty string. - -""" - -lang_all = 'en' -'''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is -selected. -''' - -no_result_for_http_status = [] -'''Return empty result for these HTTP status codes instead of throwing an error. - -.. code:: yaml - - no_result_for_http_status: [] -''' - -soft_max_redirects = 0 -'''Maximum redirects, soft limit. Record an error but don't stop the engine''' - -results_xpath = '' -'''`XPath selector`_ for the list of result items''' - -url_xpath = None -'''`XPath selector`_ of result's ``url``.''' - -content_xpath = None -'''`XPath selector`_ of result's ``content``.''' - -title_xpath = None -'''`XPath selector`_ of result's ``title``.''' - -thumbnail_xpath = False -'''`XPath selector`_ of result's ``img_src``.''' - -suggestion_xpath = '' -'''`XPath selector`_ of result's ``suggestion``.''' - -cached_xpath = '' -cached_url = '' - -cookies = {} -'''Some engines might offer different result based on cookies. -Possible use-case: To set safesearch cookie.''' - -headers = {} -'''Some engines might offer different result based headers. Possible use-case: -To set header to moderate.''' - -paging = False -'''Engine supports paging [True or False].''' - -page_size = 1 -'''Number of results on each page. Only needed if the site requires not a page -number, but an offset.''' - -first_page_num = 1 -'''Number of the first page (usually 0 or 1).''' - -time_range_support = False -'''Engine supports search time range.''' - -time_range_url = '&hours={time_range_val}' -'''Time range URL parameter in the in :py:obj:`search_url`. If no time range is -requested by the user, the URL parameter is an empty string. The -``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`. - -.. code:: yaml - - time_range_url : '&days={time_range_val}' -''' - -time_range_map = { - 'day': 24, - 'week': 24 * 7, - 'month': 24 * 30, - 'year': 24 * 365, -} -'''Maps time range value from user to ``{time_range_val}`` in -:py:obj:`time_range_url`. - -.. 
code:: yaml - - time_range_map: - day: 1 - week: 7 - month: 30 - year: 365 -''' - -safe_search_support = False -'''Engine supports safe-search.''' - -safe_search_map = {0: '&filter=none', 1: '&filter=moderate', 2: '&filter=strict'} -'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`. - -.. code:: yaml - - safesearch: true - safes_search_map: - 0: '&filter=none' - 1: '&filter=moderate' - 2: '&filter=strict' - -''' - - -def request(query, params): - '''Build request parameters (see :ref:`engine request`).''' - lang = lang_all - if params['language'] != 'all': - lang = params['language'][:2] - - time_range = '' - if params.get('time_range'): - time_range_val = time_range_map.get(params.get('time_range')) - time_range = time_range_url.format(time_range_val=time_range_val) - - safe_search = '' - if params['safesearch']: - safe_search = safe_search_map[params['safesearch']] - - fargs = { - 'query': urlencode({'q': query})[2:], - 'lang': lang, - 'pageno': (params['pageno'] - 1) * page_size + first_page_num, - 'time_range': time_range, - 'safe_search': safe_search, - } - - params['cookies'].update(cookies) - params['headers'].update(headers) - - params['url'] = search_url.format(**fargs) - params['soft_max_redirects'] = soft_max_redirects - - params['raise_for_httperror'] = False - - return params - - -def response(resp): # pylint: disable=too-many-branches - '''Scrap *results* from the response (see :ref:`engine results`).''' - if no_result_for_http_status and resp.status_code in no_result_for_http_status: - return [] - - raise_for_httperror(resp) - - results = [] - dom = html.fromstring(resp.text) - is_onion = 'onions' in categories - - if results_xpath: - for result in eval_xpath_list(dom, results_xpath): - - url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) - title = extract_text(eval_xpath_list(result, title_xpath, min_len=1)) - content = extract_text(eval_xpath_list(result, content_xpath)) - tmp_result = {'url': url, 'title': title, 'content': content} - - # add thumbnail if available - if thumbnail_xpath: - thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath) - if len(thumbnail_xpath_result) > 0: - tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url) - - # add alternative cached url if available - if cached_xpath: - tmp_result['cached_url'] = cached_url + extract_text(eval_xpath_list(result, cached_xpath, min_len=1)) - - if is_onion: - tmp_result['is_onion'] = True - - results.append(tmp_result) - - else: - if cached_xpath: - for url, title, content, cached in zip( - (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)), - map(extract_text, eval_xpath_list(dom, title_xpath)), - map(extract_text, eval_xpath_list(dom, content_xpath)), - map(extract_text, eval_xpath_list(dom, cached_xpath)), - ): - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'cached_url': cached_url + cached, - 'is_onion': is_onion, - } - ) - else: - for url, title, content in zip( - (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)), - map(extract_text, eval_xpath_list(dom, title_xpath)), - map(extract_text, eval_xpath_list(dom, content_xpath)), - ): - results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion}) - - if suggestion_xpath: - for suggestion in eval_xpath(dom, suggestion_xpath): - results.append({'suggestion': extract_text(suggestion)}) - - logger.debug("found %s results", len(results)) - return results diff --git 
a/apps/searxng/searx/engines/yacy.py b/apps/searxng/searx/engines/yacy.py deleted file mode 100755 index 0603a45..0000000 --- a/apps/searxng/searx/engines/yacy.py +++ /dev/null @@ -1,161 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""YaCy_ is a free distributed search engine, built on the principles of -peer-to-peer (P2P) networks. - -API: Dev:APIyacysearch_ - -Releases: - -- https://github.com/yacy/yacy_search_server/tags -- https://download.yacy.net/ - -.. _Yacy: https://yacy.net/ -.. _Dev:APIyacysearch: https://wiki.yacy.net/index.php/Dev:APIyacysearch - -Configuration -============= - -The engine has the following (additional) settings: - -.. code:: yaml - - - name: yacy - engine: yacy - shortcut: ya - base_url: http://localhost:8090 - # Yacy search mode. 'global' or 'local'. - search_mode: 'global' - number_of_results: 5 - http_digest_auth_user: "" - http_digest_auth_pass: "" - - -Implementations -=============== -""" -# pylint: disable=fixme - -from json import loads -from urllib.parse import urlencode -from dateutil import parser - -from httpx import DigestAuth - -from searx.utils import html_to_text - -# about -about = { - "website": 'https://yacy.net/', - "wikidata_id": 'Q1759675', - "official_api_documentation": 'https://wiki.yacy.net/index.php/Dev:API', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['general', 'images'] # TODO , 'music', 'videos', 'files' -paging = True -number_of_results = 5 -http_digest_auth_user = "" -http_digest_auth_pass = "" -search_mode = 'global' -"""Yacy search mode ``global`` or ``local``. By default, Yacy operates in ``global`` -mode. - -``global`` - Peer-to-Peer search - -``local`` - Privacy or Stealth mode, restricts the search to local yacy instance. 
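For illustration, the two modes only differ in the ``resource`` parameter of the
request URL assembled below; a standalone sketch reusing this engine's URL
template with a made-up query:

.. code:: python

    from urllib.parse import urlencode

    base_url = 'http://localhost:8090'
    search_url = (
        '/yacysearch.json?{query}'
        '&startRecord={offset}'
        '&maximumRecords={limit}'
        '&contentdom={search_type}'
        '&resource={resource}'
    )

    for mode in ('global', 'local'):
        # 'global' queries the peer-to-peer network, 'local' only the instance's own index
        print(base_url + search_url.format(
            query=urlencode({'query': 'weather'}),
            offset=0,
            limit=5,
            search_type='text',
            resource=mode,
        ))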
-""" -# search-url -base_url = 'http://localhost:8090' -search_url = ( - '/yacysearch.json?{query}' - '&startRecord={offset}' - '&maximumRecords={limit}' - '&contentdom={search_type}' - '&resource={resource}' -) - -# yacy specific type-definitions -search_types = {'general': 'text', 'images': 'image', 'files': 'app', 'music': 'audio', 'videos': 'video'} - - -def request(query, params): - offset = (params['pageno'] - 1) * number_of_results - search_type = search_types.get(params.get('category'), '0') - - params['url'] = base_url + search_url.format( - query=urlencode({'query': query}), - offset=offset, - limit=number_of_results, - search_type=search_type, - resource=search_mode, - ) - - if http_digest_auth_user and http_digest_auth_pass: - params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass) - - # add language tag if specified - if params['language'] != 'all': - params['url'] += '&lr=lang_' + params['language'].split('-')[0] - - return params - - -def response(resp): - results = [] - - raw_search_results = loads(resp.text) - - # return empty array if there are no results - if not raw_search_results: - return [] - - search_results = raw_search_results.get('channels', []) - - if len(search_results) == 0: - return [] - - for result in search_results[0].get('items', []): - # parse image results - if resp.search_params.get('category') == 'images': - result_url = '' - if 'url' in result: - result_url = result['url'] - elif 'link' in result: - result_url = result['link'] - else: - continue - - # append result - results.append( - { - 'url': result_url, - 'title': result['title'], - 'content': '', - 'img_src': result['image'], - 'template': 'images.html', - } - ) - - # parse general results - else: - publishedDate = parser.parse(result['pubDate']) - - # append result - results.append( - { - 'url': result['link'], - 'title': result['title'], - 'content': html_to_text(result['description']), - 'publishedDate': publishedDate, - } - ) - - # TODO parse video, audio and file results - - return results diff --git a/apps/searxng/searx/engines/yahoo.py b/apps/searxng/searx/engines/yahoo.py deleted file mode 100755 index 0fdeace..0000000 --- a/apps/searxng/searx/engines/yahoo.py +++ /dev/null @@ -1,188 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Yahoo Search (Web) - -Languages are supported by mapping the language to a domain. If domain is not -found in :py:obj:`lang2domain` URL ``.search.yahoo.com`` is used. 
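A minimal sketch of that fallback rule; ``domain_for`` is an illustrative helper
name, the engine applies the rule inline in ``request()``:

.. code:: python

    lang2domain = {
        'zh_chs': 'hk.search.yahoo.com',
        'zh_cht': 'tw.search.yahoo.com',
        'en': 'search.yahoo.com',
        # ... see the full mapping below
    }

    def domain_for(lang: str) -> str:
        # languages missing from the table fall back to <lang>.search.yahoo.com
        return lang2domain.get(lang, '%s.search.yahoo.com' % lang)

    print(domain_for('en'))      # search.yahoo.com
    print(domain_for('fr'))      # fr.search.yahoo.com
    print(domain_for('zh_cht'))  # tw.search.yahoo.com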
- -""" - -from urllib.parse import ( - unquote, - urlencode, -) -from lxml import html - -from searx.utils import ( - eval_xpath_getindex, - eval_xpath_list, - extract_text, -) -from searx.enginelib.traits import EngineTraits - -traits: EngineTraits - -# about -about = { - "website": 'https://search.yahoo.com/', - "wikidata_id": None, - "official_api_documentation": 'https://developer.yahoo.com/api/', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['general', 'web'] -paging = True -time_range_support = True -# send_accept_language_header = True - -time_range_dict = { - 'day': ('1d', 'd'), - 'week': ('1w', 'w'), - 'month': ('1m', 'm'), -} - -lang2domain = { - 'zh_chs': 'hk.search.yahoo.com', - 'zh_cht': 'tw.search.yahoo.com', - 'any': 'search.yahoo.com', - 'en': 'search.yahoo.com', - 'bg': 'search.yahoo.com', - 'cs': 'search.yahoo.com', - 'da': 'search.yahoo.com', - 'el': 'search.yahoo.com', - 'et': 'search.yahoo.com', - 'he': 'search.yahoo.com', - 'hr': 'search.yahoo.com', - 'ja': 'search.yahoo.com', - 'ko': 'search.yahoo.com', - 'sk': 'search.yahoo.com', - 'sl': 'search.yahoo.com', -} -"""Map language to domain""" - -locale_aliases = { - 'zh': 'zh_Hans', - 'zh-HK': 'zh_Hans', - 'zh-CN': 'zh_Hans', # dead since 2015 / routed to hk.search.yahoo.com - 'zh-TW': 'zh_Hant', -} - - -def request(query, params): - """build request""" - - lang = locale_aliases.get(params['language'], None) - if not lang: - lang = params['language'].split('-')[0] - lang = traits.get_language(lang, traits.all_locale) - - offset = (params['pageno'] - 1) * 7 + 1 - age, btf = time_range_dict.get(params['time_range'], ('', '')) - - args = urlencode( - { - 'p': query, - 'ei': 'UTF-8', - 'fl': 1, - 'vl': 'lang_' + lang, - 'btf': btf, - 'fr2': 'time', - 'age': age, - 'b': offset, - 'xargs': 0, - } - ) - - domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) - params['url'] = 'https://%s/search?%s' % (domain, args) - return params - - -def parse_url(url_string): - """remove yahoo-specific tracking-url""" - - endings = ['/RS', '/RK'] - endpositions = [] - start = url_string.find('http', url_string.find('/RU=') + 1) - - for ending in endings: - endpos = url_string.rfind(ending) - if endpos > -1: - endpositions.append(endpos) - - if start == 0 or len(endpositions) == 0: - return url_string - - end = min(endpositions) - return unquote(url_string[start:end]) - - -def response(resp): - """parse response""" - - results = [] - dom = html.fromstring(resp.text) - - # parse results - for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): - url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None) - if url is None: - continue - url = parse_url(url) - - title = eval_xpath_getindex(result, './/h3/a', 0, default=None) - if title is None: - continue - offset = len(extract_text(title.xpath('span'))) - title = extract_text(title)[offset:] - - content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='') - content = extract_text(content, allow_none=True) - - # append result - results.append({'url': url, 'title': title, 'content': content}) - - for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'): - # append suggestion - results.append({'suggestion': extract_text(suggestion)}) - - return results - - -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages from yahoo""" - - # pylint: disable=import-outside-toplevel - import babel - from searx import network - 
from searx.locales import language_tag - - engine_traits.all_locale = 'any' - - resp = network.get('https://search.yahoo.com/preferences/languages') - if not resp.ok: - print("ERROR: response from peertube is not OK.") - - dom = html.fromstring(resp.text) - offset = len('lang_') - - eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'} - - for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): - eng_tag = val[offset:] - - try: - sxng_tag = language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag))) - except babel.UnknownLocaleError: - print('ERROR: unknown language --> %s' % eng_tag) - continue - - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != eng_tag: - print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) - continue - engine_traits.languages[sxng_tag] = eng_tag diff --git a/apps/searxng/searx/engines/yahoo_news.py b/apps/searxng/searx/engines/yahoo_news.py deleted file mode 100755 index 00f208b..0000000 --- a/apps/searxng/searx/engines/yahoo_news.py +++ /dev/null @@ -1,104 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Yahoo (News) - -Yahoo News is "English only" and do not offer localized nor language queries. - -""" - -# pylint: disable=invalid-name - -import re -from urllib.parse import urlencode -from datetime import datetime, timedelta -from dateutil import parser -from lxml import html - -from searx.utils import ( - eval_xpath_list, - eval_xpath_getindex, - extract_text, -) - -from searx.engines.yahoo import parse_url - -# about -about = { - "website": 'https://news.yahoo.com', - "wikidata_id": 'Q3044717', - "official_api_documentation": 'https://developer.yahoo.com/api/', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -language_support = False -time_range_support = False -safesearch = False -paging = True -categories = ['news'] - -# search-url -search_url = ( - # fmt: off - 'https://news.search.yahoo.com/search' - '?{query}&b={offset}' - # fmt: on -) - -AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)') -AGO_TIMEDELTA = { - 'minute': timedelta(minutes=1), - 'hour': timedelta(hours=1), - 'day': timedelta(days=1), - 'week': timedelta(days=7), - 'month': timedelta(days=30), - 'year': timedelta(days=365), -} - - -def request(query, params): - offset = (params['pageno'] - 1) * 10 + 1 - - params['url'] = search_url.format(offset=offset, query=urlencode({'p': query})) - logger.debug("query_url --> %s", params['url']) - return params - - -def response(resp): - results = [] - dom = html.fromstring(resp.text) - - # parse results - for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'): - - url = eval_xpath_getindex(result, './/h4/a/@href', 0, None) - if url is None: - continue - url = parse_url(url) - title = extract_text(result.xpath('.//h4/a')) - content = extract_text(result.xpath('.//p')) - img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None) - - item = {'url': url, 'title': title, 'content': content, 'img_src': img_src} - - pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]')) - ago = AGO_RE.search(pub_date) - if ago: - number = int(ago.group(1)) - delta = AGO_TIMEDELTA[ago.group(2)] - pub_date = datetime.now() - delta * number - else: - try: - pub_date = parser.parse(pub_date) - except parser.ParserError: - pub_date = None - - if pub_date is not None: - item['publishedDate'] = pub_date - results.append(item) - - for suggestion in eval_xpath_list(dom, 
'//div[contains(@class,"AlsoTry")]//td'): - results.append({'suggestion': extract_text(suggestion)}) - - return results diff --git a/apps/searxng/searx/engines/youtube_api.py b/apps/searxng/searx/engines/youtube_api.py deleted file mode 100755 index 1b332a9..0000000 --- a/apps/searxng/searx/engines/youtube_api.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Youtube (Videos) -""" - -from json import loads -from dateutil import parser -from urllib.parse import urlencode -from searx.exceptions import SearxEngineAPIException - -# about -about = { - "website": 'https://www.youtube.com/', - "wikidata_id": 'Q866', - "official_api_documentation": 'https://developers.google.com/youtube/v3/docs/search/list?apix=true', - "use_official_api": True, - "require_api_key": False, - "results": 'JSON', -} - -# engine dependent config -categories = ['videos', 'music'] -paging = False -api_key = None - -# search-url -base_url = 'https://www.googleapis.com/youtube/v3/search' -search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}' -base_youtube_url = 'https://www.youtube.com/watch?v=' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query}), api_key=api_key) - - # add language tag if specified - if params['language'] != 'all': - params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_results = loads(resp.text) - - if 'error' in search_results and 'message' in search_results['error']: - raise SearxEngineAPIException(search_results['error']['message']) - - # return empty array if there are no results - if 'items' not in search_results: - return [] - - # parse results - for result in search_results['items']: - videoid = result['id']['videoId'] - - title = result['snippet']['title'] - content = '' - thumbnail = '' - - pubdate = result['snippet']['publishedAt'] - publishedDate = parser.parse(pubdate) - - thumbnail = result['snippet']['thumbnails']['high']['url'] - - content = result['snippet']['description'] - - url = base_youtube_url + videoid - - # append result - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'template': 'videos.html', - 'publishedDate': publishedDate, - 'iframe_src': "https://www.youtube-nocookie.com/embed/" + videoid, - 'thumbnail': thumbnail, - } - ) - - # return results - return results diff --git a/apps/searxng/searx/engines/youtube_noapi.py b/apps/searxng/searx/engines/youtube_noapi.py deleted file mode 100755 index 7992adf..0000000 --- a/apps/searxng/searx/engines/youtube_noapi.py +++ /dev/null @@ -1,171 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Youtube (Videos) -""" - -from functools import reduce -from json import loads, dumps -from urllib.parse import quote_plus - -# about -about = { - "website": 'https://www.youtube.com/', - "wikidata_id": 'Q866', - "official_api_documentation": 'https://developers.google.com/youtube/v3/docs/search/list?apix=true', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['videos', 'music'] -paging = True -language_support = False -time_range_support = True - -# search-url -base_url = 'https://www.youtube.com/results' -search_url = base_url + '?search_query={query}&page={page}' -time_range_url = '&sp=EgII{time_range}%253D%253D' -# the key seems to be constant -next_page_url = 
'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' -time_range_dict = {'day': 'Ag', 'week': 'Aw', 'month': 'BA', 'year': 'BQ'} - -base_youtube_url = 'https://www.youtube.com/watch?v=' - - -# do search-request -def request(query, params): - params['cookies']['CONSENT'] = "YES+" - if not params['engine_data'].get('next_page_token'): - params['url'] = search_url.format(query=quote_plus(query), page=params['pageno']) - if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']]) - else: - params['url'] = next_page_url - params['method'] = 'POST' - params['data'] = dumps( - { - 'context': {"client": {"clientName": "WEB", "clientVersion": "2.20210310.12.01"}}, - 'continuation': params['engine_data']['next_page_token'], - } - ) - params['headers']['Content-Type'] = 'application/json' - - return params - - -# get response from search-request -def response(resp): - if resp.search_params.get('engine_data'): - return parse_next_page_response(resp.text) - return parse_first_page_response(resp.text) - - -def parse_next_page_response(response_text): - results = [] - result_json = loads(response_text) - for section in ( - result_json['onResponseReceivedCommands'][0] - .get('appendContinuationItemsAction')['continuationItems'][0] - .get('itemSectionRenderer')['contents'] - ): - if 'videoRenderer' not in section: - continue - section = section['videoRenderer'] - content = "-" - if 'descriptionSnippet' in section: - content = ' '.join(x['text'] for x in section['descriptionSnippet']['runs']) - results.append( - { - 'url': base_youtube_url + section['videoId'], - 'title': ' '.join(x['text'] for x in section['title']['runs']), - 'content': content, - 'author': section['ownerText']['runs'][0]['text'], - 'length': section['lengthText']['simpleText'], - 'template': 'videos.html', - 'iframe_src': 'https://www.youtube-nocookie.com/embed/' + section['videoId'], - 'thumbnail': section['thumbnail']['thumbnails'][-1]['url'], - } - ) - try: - token = ( - result_json['onResponseReceivedCommands'][0] - .get('appendContinuationItemsAction')['continuationItems'][1] - .get('continuationItemRenderer')['continuationEndpoint'] - .get('continuationCommand')['token'] - ) - results.append( - { - "engine_data": token, - "key": "next_page_token", - } - ) - except: - pass - - return results - - -def parse_first_page_response(response_text): - results = [] - results_data = response_text[response_text.find('ytInitialData') :] - results_data = results_data[results_data.find('{') : results_data.find(';')] - results_json = loads(results_data) if results_data else {} - sections = ( - results_json.get('contents', {}) - .get('twoColumnSearchResultsRenderer', {}) - .get('primaryContents', {}) - .get('sectionListRenderer', {}) - .get('contents', []) - ) - - for section in sections: - if "continuationItemRenderer" in section: - next_page_token = ( - section["continuationItemRenderer"] - .get("continuationEndpoint", {}) - .get("continuationCommand", {}) - .get("token", "") - ) - if next_page_token: - results.append( - { - "engine_data": next_page_token, - "key": "next_page_token", - } - ) - for video_container in section.get('itemSectionRenderer', {}).get('contents', []): - video = video_container.get('videoRenderer', {}) - videoid = video.get('videoId') - if videoid is not None: - url = base_youtube_url + videoid - thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg' - title = get_text_from_json(video.get('title', {})) 
- content = get_text_from_json(video.get('descriptionSnippet', {})) - author = get_text_from_json(video.get('ownerText', {})) - length = get_text_from_json(video.get('lengthText', {})) - - # append result - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'author': author, - 'length': length, - 'template': 'videos.html', - 'iframe_src': 'https://www.youtube-nocookie.com/embed/' + videoid, - 'thumbnail': thumbnail, - } - ) - - # return results - return results - - -def get_text_from_json(element): - if 'runs' in element: - return reduce(lambda a, b: a + b.get('text', ''), element.get('runs'), '') - else: - return element.get('simpleText', '') diff --git a/apps/searxng/searx/engines/zlibrary.py b/apps/searxng/searx/engines/zlibrary.py deleted file mode 100755 index 813d52f..0000000 --- a/apps/searxng/searx/engines/zlibrary.py +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""`Z-Library`_ (abbreviated as z-lib, formerly BookFinder) is a shadow library -project for file-sharing access to scholarly journal articles, academic texts -and general-interest books. It began as a mirror of Library Genesis, from which -most of its books originate. - -.. _Z-Library: https://zlibrary-global.se/ - -Configuration -============= - -The engine has the following additional settings: - -- :py:obj:`zlib_year_from` -- :py:obj:`zlib_year_to` -- :py:obj:`zlib_ext` - -With this options a SearXNG maintainer is able to configure **additional** -engines for specific searches in Z-Library. For example a engine to search -only for EPUB from 2010 to 2020. - -.. code:: yaml - - - name: z-library 2010s epub - engine: zlibrary - shortcut: zlib2010s - zlib_year_from: '2010' - zlib_year_to: '2020' - zlib_ext: 'EPUB' - -Implementations -=============== - -""" -from __future__ import annotations -from typing import TYPE_CHECKING -from typing import List, Dict, Any, Optional -from datetime import datetime -from urllib.parse import quote -from lxml import html -from flask_babel import gettext - -from searx.utils import extract_text, eval_xpath, eval_xpath_list -from searx.enginelib.traits import EngineTraits -from searx.data import ENGINE_TRAITS - -if TYPE_CHECKING: - import httpx - import logging - - logger: logging.Logger - -# about -about: Dict[str, Any] = { - "website": "https://zlibrary-global.se", - "wikidata_id": "Q104863992", - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": "HTML", -} - -categories: List[str] = ["files"] -paging: bool = True -base_url: str = "https://zlibrary-global.se" - -zlib_year_from: str = "" -"""Filter z-library's results by year from. E.g '2010'. -""" - -zlib_year_to: str = "" -"""Filter z-library's results by year to. E.g. '2010'. -""" - -zlib_ext: str = "" -"""Filter z-library's results by a file ending. Common filters for example are -``PDF`` and ``EPUB``. 
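As a standalone sketch, this is roughly how the three settings end up in the
search URL assembled by ``request()`` below; the query and the language value
are made-up examples:

.. code:: python

    from urllib.parse import quote

    base_url = "https://zlibrary-global.se"
    zlib_year_from, zlib_year_to, zlib_ext = "2010", "2020", "EPUB"

    url = (
        base_url
        + "/s/{search_query}/?page={pageno}"
        + "&yearFrom={zlib_year_from}"
        + "&yearTo={zlib_year_to}"
        + "&languages[]={lang}"
        + "&extensions[]={zlib_ext}"
    ).format(
        search_query=quote("operating systems"),
        pageno=1,
        lang="english",
        zlib_year_from=zlib_year_from,
        zlib_year_to=zlib_year_to,
        zlib_ext=zlib_ext,
    )
    print(url)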
-""" - - -def init(engine_settings=None) -> None: # pylint: disable=unused-argument - """Check of engine's settings.""" - traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"]) - - if zlib_ext and zlib_ext not in traits.custom["ext"]: - raise ValueError(f"invalid setting ext: {zlib_ext}") - if zlib_year_from and zlib_year_from not in traits.custom["year_from"]: - raise ValueError(f"invalid setting year_from: {zlib_year_from}") - if zlib_year_to and zlib_year_to not in traits.custom["year_to"]: - raise ValueError(f"invalid setting year_to: {zlib_year_to}") - - -def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]: - lang: str = traits.get_language(params["language"], traits.all_locale) # type: ignore - search_url: str = ( - base_url - + "/s/{search_query}/?page={pageno}" - + "&yearFrom={zlib_year_from}" - + "&yearTo={zlib_year_to}" - + "&languages[]={lang}" - + "&extensions[]={zlib_ext}" - ) - params["url"] = search_url.format( - search_query=quote(query), - pageno=params["pageno"], - lang=lang, - zlib_year_from=zlib_year_from, - zlib_year_to=zlib_year_to, - zlib_ext=zlib_ext, - ) - return params - - -def response(resp: httpx.Response) -> List[Dict[str, Any]]: - results: List[Dict[str, Any]] = [] - dom = html.fromstring(resp.text) - - for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'): - results.append(_parse_result(item)) - - return results - - -def _text(item, selector: str) -> str | None: - return extract_text(eval_xpath(item, selector)) - - -i18n_language = gettext("Language") -i18n_book_rating = gettext("Book rating") -i18n_file_quality = gettext("File quality") - - -def _parse_result(item) -> Dict[str, Any]: - - author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]') - - result = { - "template": "paper.html", - "url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0], - "title": _text(item, './/*[@itemprop="name"]'), - "authors": [extract_text(author) for author in author_elements], - "publisher": _text(item, './/a[@title="Publisher"]'), - "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'), - "img_src": _text(item, './/img[contains(@class, "cover")]/@data-src'), - } - - year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]') - if year: - result["publishedDate"] = datetime.strptime(year, '%Y') - - content = [] - language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]') - if language: - content.append(f"{i18n_language}: {language.capitalize()}") - book_rating = _text(item, './/span[contains(@class, "book-rating-interest-score")]') - if book_rating and float(book_rating): - content.append(f"{i18n_book_rating}: {book_rating}") - file_quality = _text(item, './/span[contains(@class, "book-rating-quality-score")]') - if file_quality and float(file_quality): - content.append(f"{i18n_file_quality}: {file_quality}") - result["content"] = " | ".join(content) - - return result - - -def fetch_traits(engine_traits: EngineTraits) -> None: - """Fetch languages and other search arguments from zlibrary's search form.""" - # pylint: disable=import-outside-toplevel - - import babel - from searx.network import get # see https://github.com/searxng/searxng/issues/762 - from searx.locales import language_tag - - engine_traits.all_locale = "" - engine_traits.custom["ext"] = [] - engine_traits.custom["year_from"] = [] - 
engine_traits.custom["year_to"] = [] - - resp = get(base_url) - if not resp.ok: # type: ignore - raise RuntimeError("Response from zlibrary's search page is not OK.") - dom = html.fromstring(resp.text) # type: ignore - - for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"): - engine_traits.custom["year_from"].append(year.get("value")) - - for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"): - engine_traits.custom["year_to"].append(year.get("value")) - - for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"): - value: Optional[str] = ext.get("value") - if value is None: - value = "" - engine_traits.custom["ext"].append(value) - - # Handle languages - # Z-library uses English names for languages, so we need to map them to their respective locales - language_name_locale_map: Dict[str, babel.Locale] = {} - for locale in babel.core.localedata.locale_identifiers(): # type: ignore - # Create a Locale object for the current locale - loc = babel.Locale.parse(locale) - language_name_locale_map[loc.english_name.lower()] = loc # type: ignore - - for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"): - eng_lang = x.get("value") - if eng_lang is None: - continue - try: - locale = language_name_locale_map[eng_lang.lower()] - except KeyError: - # silently ignore unknown languages - # print("ERROR: %s is unknown by babel" % (eng_lang)) - continue - sxng_lang = language_tag(locale) - conflict = engine_traits.languages.get(sxng_lang) - if conflict: - if conflict != eng_lang: - print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) - continue - engine_traits.languages[sxng_lang] = eng_lang diff --git a/apps/searxng/searx/exceptions.py b/apps/searxng/searx/exceptions.py deleted file mode 100755 index 069be90..0000000 --- a/apps/searxng/searx/exceptions.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Exception types raised by SearXNG modules. 
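Engines typically raise these from their ``response()`` parsers; a minimal
sketch mirroring the pattern used by the ``youtube_api`` engine above:

.. code:: python

    from json import loads

    from searx.exceptions import SearxEngineAPIException

    def response(resp):
        search_results = loads(resp.text)
        # surface an upstream application error instead of returning bogus results
        if 'error' in search_results and 'message' in search_results['error']:
            raise SearxEngineAPIException(search_results['error']['message'])
        return []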
-""" - -from typing import Optional, Union - - -class SearxException(Exception): - """Base SearXNG exception.""" - - -class SearxParameterException(SearxException): - """Raised when query miss a required paramater""" - - def __init__(self, name, value): - if value == '' or value is None: - message = 'Empty ' + name + ' parameter' - else: - message = 'Invalid value "' + value + '" for parameter ' + name - super().__init__(message) - self.message = message - self.parameter_name = name - self.parameter_value = value - - -class SearxSettingsException(SearxException): - """Error while loading the settings""" - - def __init__(self, message: Union[str, Exception], filename: Optional[str]): - super().__init__(message) - self.message = message - self.filename = filename - - -class SearxEngineException(SearxException): - """Error inside an engine""" - - -class SearxXPathSyntaxException(SearxEngineException): - """Syntax error in a XPATH""" - - def __init__(self, xpath_spec, message): - super().__init__(str(xpath_spec) + " " + message) - self.message = message - # str(xpath_spec) to deal with str and XPath instance - self.xpath_str = str(xpath_spec) - - -class SearxEngineResponseException(SearxEngineException): - """Impossible to parse the result of an engine""" - - -class SearxEngineAPIException(SearxEngineResponseException): - """The website has returned an application error""" - - -class SearxEngineAccessDeniedException(SearxEngineResponseException): - """The website is blocking the access""" - - SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineAccessDenied" - """This settings contains the default suspended time (default 86400 sec / 1 - day).""" - - def __init__(self, suspended_time: int = None, message: str = 'Access denied'): - """Generic exception to raise when an engine denies access to the results. - - :param suspended_time: How long the engine is going to be suspended in - second. Defaults to None. - :type suspended_time: int, None - :param message: Internal message. Defaults to ``Access denied`` - :type message: str - """ - suspended_time = suspended_time or self._get_default_suspended_time() - super().__init__(message + ', suspended_time=' + str(suspended_time)) - self.suspended_time = suspended_time - self.message = message - - def _get_default_suspended_time(self): - from searx import get_setting # pylint: disable=C0415 - - return get_setting(self.SUSPEND_TIME_SETTING) - - -class SearxEngineCaptchaException(SearxEngineAccessDeniedException): - """The website has returned a CAPTCHA.""" - - SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineCaptcha" - """This settings contains the default suspended time (default 86400 sec / 1 - day).""" - - def __init__(self, suspended_time=None, message='CAPTCHA'): - super().__init__(message=message, suspended_time=suspended_time) - - -class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException): - """The website has returned a Too Many Request status code - - By default, searx stops sending requests to this engine for 1 hour. 
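The effective duration comes from the setting named in ``SUSPEND_TIME_SETTING``
below and can be read back with ``get_setting``; a minimal sketch:

.. code:: python

    from searx import get_setting

    # number of seconds the engine stays suspended after a "too many requests"
    # answer, unless the exception was raised with an explicit suspended_time
    print(get_setting("search.suspended_times.SearxEngineTooManyRequests"))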
- """ - - SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineTooManyRequests" - """This settings contains the default suspended time (default 3660 sec / 1 - hour).""" - - def __init__(self, suspended_time=None, message='Too many request'): - super().__init__(message=message, suspended_time=suspended_time) - - -class SearxEngineXPathException(SearxEngineResponseException): - """Error while getting the result of an XPath expression""" - - def __init__(self, xpath_spec, message): - super().__init__(str(xpath_spec) + " " + message) - self.message = message - # str(xpath_spec) to deal with str and XPath instance - self.xpath_str = str(xpath_spec) diff --git a/apps/searxng/searx/external_bang.py b/apps/searxng/searx/external_bang.py deleted file mode 100755 index 0336d88..0000000 --- a/apps/searxng/searx/external_bang.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -from urllib.parse import quote_plus, urlparse -from searx.data import EXTERNAL_BANGS - -LEAF_KEY = chr(16) - - -def get_node(external_bangs_db, bang): - node = external_bangs_db['trie'] - after = '' - before = '' - for bang_letter in bang: - after += bang_letter - if after in node and isinstance(node, dict): - node = node[after] - before += after - after = '' - return node, before, after - - -def get_bang_definition_and_ac(external_bangs_db, bang): - node, before, after = get_node(external_bangs_db, bang) - - bang_definition = None - bang_ac_list = [] - if after != '': - for k in node: - if k.startswith(after): - bang_ac_list.append(before + k) - elif isinstance(node, dict): - bang_definition = node.get(LEAF_KEY) - bang_ac_list = [before + k for k in node.keys() if k != LEAF_KEY] - elif isinstance(node, str): - bang_definition = node - bang_ac_list = [] - - return bang_definition, bang_ac_list - - -def resolve_bang_definition(bang_definition, query): - url, rank = bang_definition.split(chr(1)) - if url.startswith('//'): - url = 'https:' + url - if query: - url = url.replace(chr(2), quote_plus(query)) - else: - # go to main instead of search page - o = urlparse(url) - url = o.scheme + '://' + o.netloc - - rank = int(rank) if len(rank) > 0 else 0 - return (url, rank) - - -def get_bang_definition_and_autocomplete(bang, external_bangs_db=None): - if external_bangs_db is None: - external_bangs_db = EXTERNAL_BANGS - - bang_definition, bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang) - - new_autocomplete = [] - current = [*bang_ac_list] - done = set() - while len(current) > 0: - bang_ac = current.pop(0) - done.add(bang_ac) - - current_bang_definition, current_bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang_ac) - if current_bang_definition: - _, order = resolve_bang_definition(current_bang_definition, '') - new_autocomplete.append((bang_ac, order)) - for new_bang in current_bang_ac_list: - if new_bang not in done and new_bang not in current: - current.append(new_bang) - - new_autocomplete.sort(key=lambda t: (-t[1], t[0])) - new_autocomplete = list(map(lambda t: t[0], new_autocomplete)) - - return bang_definition, new_autocomplete - - -def get_bang_url(search_query, external_bangs_db=None): - """ - Redirects if the user supplied a correct bang search. - :param search_query: This is a search_query object which contains preferences and the submitted queries. - :return: None if the bang was invalid, else a string of the redirect url. 
- """ - ret_val = None - - if external_bangs_db is None: - external_bangs_db = EXTERNAL_BANGS - - if search_query.external_bang: - bang_definition, _ = get_bang_definition_and_ac(external_bangs_db, search_query.external_bang) - if bang_definition and isinstance(bang_definition, str): - ret_val = resolve_bang_definition(bang_definition, search_query.query)[0] - - return ret_val diff --git a/apps/searxng/searx/external_urls.py b/apps/searxng/searx/external_urls.py deleted file mode 100755 index 7844b58..0000000 --- a/apps/searxng/searx/external_urls.py +++ /dev/null @@ -1,91 +0,0 @@ -import math - -from searx.data import EXTERNAL_URLS - - -IMDB_PREFIX_TO_URL_ID = { - 'tt': 'imdb_title', - 'mn': 'imdb_name', - 'ch': 'imdb_character', - 'co': 'imdb_company', - 'ev': 'imdb_event', -} -HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/' - - -def get_imdb_url_id(imdb_item_id): - id_prefix = imdb_item_id[:2] - return IMDB_PREFIX_TO_URL_ID.get(id_prefix) - - -def get_wikimedia_image_id(url): - if url.startswith(HTTP_WIKIMEDIA_IMAGE): - return url[len(HTTP_WIKIMEDIA_IMAGE) :] - if url.startswith('File:'): - return url[len('File:') :] - return url - - -def get_external_url(url_id, item_id, alternative="default"): - """Return an external URL or None if url_id is not found. - - url_id can take value from data/external_urls.json - The "imdb_id" value is automatically converted according to the item_id value. - - If item_id is None, the raw URL with the $1 is returned. - """ - if item_id is not None: - if url_id == 'imdb_id': - url_id = get_imdb_url_id(item_id) - elif url_id == 'wikimedia_image': - item_id = get_wikimedia_image_id(item_id) - - url_description = EXTERNAL_URLS.get(url_id) - if url_description: - url_template = url_description["urls"].get(alternative) - if url_template is not None: - if item_id is not None: - return url_template.replace('$1', item_id) - else: - return url_template - return None - - -def get_earth_coordinates_url(latitude, longitude, osm_zoom, alternative='default'): - url = ( - get_external_url('map', None, alternative) - .replace('${latitude}', str(latitude)) - .replace('${longitude}', str(longitude)) - .replace('${zoom}', str(osm_zoom)) - ) - return url - - -def area_to_osm_zoom(area): - """Convert an area in km² into an OSM zoom. Less reliable if the shape is not round. 
- - logarithm regression using these data: - * 9596961 -> 4 (China) - * 3287263 -> 5 (India) - * 643801 -> 6 (France) - * 6028 -> 9 - * 1214 -> 10 - * 891 -> 12 - * 12 -> 13 - - In WolframAlpha: - >>> log fit {9596961,15},{3287263, 14},{643801,13},{6028,10},{1214,9},{891,7},{12,6} - - with 15 = 19-4 (China); 14 = 19-5 (India) and so on - - Args: - area (int,float,str): area in km² - - Returns: - int: OSM zoom or 19 in area is not a number - """ - try: - amount = float(area) - return max(0, min(19, round(19 - 0.688297 * math.log(226.878 * amount)))) - except ValueError: - return 19 diff --git a/apps/searxng/searx/flaskfix.py b/apps/searxng/searx/flaskfix.py deleted file mode 100755 index 326c4b9..0000000 --- a/apps/searxng/searx/flaskfix.py +++ /dev/null @@ -1,78 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring - -from urllib.parse import urlparse - -from werkzeug.middleware.proxy_fix import ProxyFix -from werkzeug.serving import WSGIRequestHandler - -from searx import settings - - -class ReverseProxyPathFix: - '''Wrap the application in this middleware and configure the - front-end server to add these headers, to let you quietly bind - this to a URL other than / and to an HTTP scheme that is - different than what is used locally. - - http://flask.pocoo.org/snippets/35/ - - In nginx: - location /myprefix { - proxy_pass http://127.0.0.1:8000; - proxy_set_header Host $host; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Scheme $scheme; - proxy_set_header X-Script-Name /myprefix; - } - - :param wsgi_app: the WSGI application - ''' - - # pylint: disable=too-few-public-methods - - def __init__(self, wsgi_app): - - self.wsgi_app = wsgi_app - self.script_name = None - self.scheme = None - self.server = None - - if settings['server']['base_url']: - - # If base_url is specified, then these values from are given - # preference over any Flask's generics. - - base_url = urlparse(settings['server']['base_url']) - self.script_name = base_url.path - if self.script_name.endswith('/'): - # remove trailing slash to avoid infinite redirect on the index - # see https://github.com/searx/searx/issues/2729 - self.script_name = self.script_name[:-1] - self.scheme = base_url.scheme - self.server = base_url.netloc - - def __call__(self, environ, start_response): - script_name = self.script_name or environ.get('HTTP_X_SCRIPT_NAME', '') - if script_name: - environ['SCRIPT_NAME'] = script_name - path_info = environ['PATH_INFO'] - if path_info.startswith(script_name): - environ['PATH_INFO'] = path_info[len(script_name) :] - - scheme = self.scheme or environ.get('HTTP_X_SCHEME', '') - if scheme: - environ['wsgi.url_scheme'] = scheme - - server = self.server or environ.get('HTTP_X_FORWARDED_HOST', '') - if server: - environ['HTTP_HOST'] = server - return self.wsgi_app(environ, start_response) - - -def patch_application(app): - # serve pages with HTTP/1.1 - WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server']['http_protocol_version']) - # patch app to handle non root url-s behind proxy & wsgi - app.wsgi_app = ReverseProxyPathFix(ProxyFix(app.wsgi_app)) diff --git a/apps/searxng/searx/infopage/__init__.py b/apps/searxng/searx/infopage/__init__.py deleted file mode 100755 index 6b8fd91..0000000 --- a/apps/searxng/searx/infopage/__init__.py +++ /dev/null @@ -1,187 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pyright: basic -"""Render SearXNG instance documentation. 
- -Usage in a Flask app route: - -.. code:: python - - from searx import infopage - - _INFO_PAGES = infopage.InfoPageSet(infopage.MistletoePage) - - @app.route('/info/', methods=['GET']) - def info(pagename): - - locale = request.preferences.get_value('locale') - page = _INFO_PAGES.get_page(pagename, locale) - -""" - -__all__ = ['InfoPage', 'InfoPageSet'] - -import os -import os.path -import logging -import typing - -import urllib.parse -import jinja2 -from flask.helpers import url_for -from markdown_it import MarkdownIt - -from .. import get_setting -from ..compat import cached_property -from ..version import GIT_URL -from ..locales import LOCALE_NAMES - - -logger = logging.getLogger('searx.infopage') -_INFO_FOLDER = os.path.abspath(os.path.dirname(__file__)) - - -class InfoPage: - """A page of the :py:obj:`online documentation `.""" - - def __init__(self, fname): - self.fname = fname - - @cached_property - def raw_content(self): - """Raw content of the page (without any jinja rendering)""" - with open(self.fname, 'r', encoding='utf-8') as f: - return f.read() - - @cached_property - def content(self): - """Content of the page (rendered in a Jinja conntext)""" - ctx = self.get_ctx() - template = jinja2.Environment().from_string(self.raw_content) - return template.render(**ctx) - - @cached_property - def title(self): - """Title of the content (without any markup)""" - t = "" - for l in self.raw_content.split('\n'): - if l.startswith('# '): - t = l.strip('# ') - return t - - @cached_property - def html(self): - """Render Markdown (CommonMark_) to HTML by using markdown-it-py_. - - .. _CommonMark: https://commonmark.org/ - .. _markdown-it-py: https://github.com/executablebooks/markdown-it-py - - """ - return ( - MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(self.content) - ) - - def get_ctx(self): - """Jinja context to render :py:obj:`InfoPage.content`""" - - def _md_link(name, url): - url = url_for(url, _external=True) - return "[%s](%s)" % (name, url) - - def _md_search(query): - url = '%s?q=%s' % (url_for('search', _external=True), urllib.parse.quote(query)) - return '[%s](%s)' % (query, url) - - ctx = {} - ctx['GIT_URL'] = GIT_URL - ctx['get_setting'] = get_setting - ctx['link'] = _md_link - ctx['search'] = _md_search - - return ctx - - def __repr__(self): - return f'<{self.__class__.__name__} fname={self.fname!r}>' - - -class InfoPageSet: # pylint: disable=too-few-public-methods - """Cached rendering of the online documentation a SearXNG instance has. - - :param page_class: render online documentation by :py:obj:`InfoPage` parser. 
- :type page_class: :py:obj:`InfoPage` - - :param info_folder: information directory - :type info_folder: str - """ - - def __init__( - self, page_class: typing.Optional[typing.Type[InfoPage]] = None, info_folder: typing.Optional[str] = None - ): - self.page_class = page_class or InfoPage - self.folder: str = info_folder or _INFO_FOLDER - """location of the Markdwon files""" - - self.CACHE: typing.Dict[tuple, typing.Optional[InfoPage]] = {} - - self.locale_default: str = 'en' - """default language""" - - self.locales: typing.List[str] = [ - locale.replace('_', '-') for locale in os.listdir(_INFO_FOLDER) if locale.replace('_', '-') in LOCALE_NAMES - ] - """list of supported languages (aka locales)""" - - self.toc: typing.List[str] = [ - 'search-syntax', - 'about', - 'donate', - ] - """list of articles in the online documentation""" - - def get_page(self, pagename: str, locale: typing.Optional[str] = None): - """Return ``pagename`` instance of :py:obj:`InfoPage` - - :param pagename: name of the page, a value from :py:obj:`InfoPageSet.toc` - :type pagename: str - - :param locale: language of the page, e.g. ``en``, ``zh_Hans_CN`` - (default: :py:obj:`InfoPageSet.i18n_origin`) - :type locale: str - - """ - locale = locale or self.locale_default - - if pagename not in self.toc: - return None - if locale not in self.locales: - return None - - cache_key = (pagename, locale) - - if cache_key in self.CACHE: - return self.CACHE[cache_key] - - # not yet instantiated - - fname = os.path.join(self.folder, locale.replace('-', '_'), pagename) + '.md' - if not os.path.exists(fname): - logger.info('file %s does not exists', fname) - self.CACHE[cache_key] = None - return None - - page = self.page_class(fname) - self.CACHE[cache_key] = page - return page - - def iter_pages(self, locale: typing.Optional[str] = None, fallback_to_default=False): - """Iterate over all pages of the TOC""" - locale = locale or self.locale_default - for page_name in self.toc: - page_locale = locale - page = self.get_page(page_name, locale) - if fallback_to_default and page is None: - page_locale = self.locale_default - page = self.get_page(page_name, self.locale_default) - if page is not None: - # page is None if the page was deleted by the administrator - yield page_name, page_locale, page diff --git a/apps/searxng/searx/locales.py b/apps/searxng/searx/locales.py deleted file mode 100755 index 12066f3..0000000 --- a/apps/searxng/searx/locales.py +++ /dev/null @@ -1,471 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`. -""" - -from typing import Set, Optional, List -import os -import pathlib - -import babel -from babel.support import Translations -import babel.languages -import babel.core -import flask_babel -import flask -from flask.ctx import has_request_context -from searx import logger - -logger = logger.getChild('locales') - - -# safe before monkey patching flask_babel.get_translations -_flask_babel_get_translations = flask_babel.get_translations - -LOCALE_NAMES = {} -"""Mapping of locales and their description. Locales e.g. 'fr' or 'pt-BR' (see -:py:obj:`locales_initialize`). - -:meta hide-value: -""" - -RTL_LOCALES: Set[str] = set() -"""List of *Right-To-Left* locales e.g. 
'he' or 'fa-IR' (see -:py:obj:`locales_initialize`).""" - -ADDITIONAL_TRANSLATIONS = { - "dv": "ދިވެހި (Dhivehi)", - "oc": "Occitan", - "szl": "Ślōnski (Silesian)", - "pap": "Papiamento", -} -"""Additional languages SearXNG has translations for but not supported by -python-babel (see :py:obj:`locales_initialize`).""" - -LOCALE_BEST_MATCH = { - "dv": "si", - "oc": 'fr-FR', - "szl": "pl", - "nl-BE": "nl", - "zh-HK": "zh-Hant-TW", - "pap": "pt-BR", -} -"""Map a locale we do not have a translations for to a locale we have a -translation for. By example: use Taiwan version of the translation for Hong -Kong.""" - - -def localeselector(): - locale = 'en' - if has_request_context(): - value = flask.request.preferences.get_value('locale') - if value: - locale = value - - # first, set the language that is not supported by babel - if locale in ADDITIONAL_TRANSLATIONS: - flask.request.form['use-translation'] = locale - - # second, map locale to a value python-babel supports - locale = LOCALE_BEST_MATCH.get(locale, locale) - - if locale == '': - # if there is an error loading the preferences - # the locale is going to be '' - locale = 'en' - - # babel uses underscore instead of hyphen. - locale = locale.replace('-', '_') - return locale - - -def get_translations(): - """Monkey patch of :py:obj:`flask_babel.get_translations`""" - if has_request_context(): - use_translation = flask.request.form.get('use-translation') - if use_translation in ADDITIONAL_TRANSLATIONS: - babel_ext = flask_babel.current_app.extensions['babel'] - return Translations.load(babel_ext.translation_directories[0], use_translation) - return _flask_babel_get_translations() - - -def get_locale_descr(locale, locale_name): - """Get locale name e.g. 'Français - fr' or 'Português (Brasil) - pt-BR' - - :param locale: instance of :py:class:`Locale` - :param locale_name: name e.g. 'fr' or 'pt_BR' (delimiter is *underscore*) - """ - - native_language, native_territory = _get_locale_descr(locale, locale_name) - english_language, english_territory = _get_locale_descr(locale, 'en') - - if native_territory == english_territory: - english_territory = None - - if not native_territory and not english_territory: - if native_language == english_language: - return native_language - return native_language + ' (' + english_language + ')' - - result = native_language + ', ' + native_territory + ' (' + english_language - if english_territory: - return result + ', ' + english_territory + ')' - return result + ')' - - -def _get_locale_descr(locale, language_code): - language_name = locale.get_language_name(language_code).capitalize() - if language_name and ('a' <= language_name[0] <= 'z'): - language_name = language_name.capitalize() - terrirtory_name = locale.get_territory_name(language_code) - return language_name, terrirtory_name - - -def locales_initialize(directory=None): - """Initialize locales environment of the SearXNG session. 
- - - monkey patch :py:obj:`flask_babel.get_translations` by :py:obj:`get_translations` - - init global names :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES` - """ - - directory = directory or pathlib.Path(__file__).parent / 'translations' - logger.debug("locales_initialize: %s", directory) - flask_babel.get_translations = get_translations - - for tag, descr in ADDITIONAL_TRANSLATIONS.items(): - locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-') - LOCALE_NAMES[tag] = descr - if locale.text_direction == 'rtl': - RTL_LOCALES.add(tag) - - for tag in LOCALE_BEST_MATCH: - descr = LOCALE_NAMES.get(tag) - if not descr: - locale = babel.Locale.parse(tag, sep='-') - LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_')) - if locale.text_direction == 'rtl': - RTL_LOCALES.add(tag) - - for dirname in sorted(os.listdir(directory)): - # Based on https://flask-babel.tkte.ch/_modules/flask_babel.html#Babel.list_translations - if not os.path.isdir(os.path.join(directory, dirname, 'LC_MESSAGES')): - continue - tag = dirname.replace('_', '-') - descr = LOCALE_NAMES.get(tag) - if not descr: - locale = babel.Locale.parse(dirname) - LOCALE_NAMES[tag] = get_locale_descr(locale, dirname) - if locale.text_direction == 'rtl': - RTL_LOCALES.add(tag) - - -def region_tag(locale: babel.Locale) -> str: - """Returns SearXNG's region tag from the locale (e.g. zh-TW , en-US).""" - if not locale.territory: - raise ValueError('%s missed a territory') - return locale.language + '-' + locale.territory - - -def language_tag(locale: babel.Locale) -> str: - """Returns SearXNG's language tag from the locale and if exits, the tag - includes the script name (e.g. en, zh_Hant). - """ - sxng_lang = locale.language - if locale.script: - sxng_lang += '_' + locale.script - return sxng_lang - - -def get_locale(locale_tag: str) -> Optional[babel.Locale]: - """Returns a :py:obj:`babel.Locale` object parsed from argument - ``locale_tag``""" - try: - locale = babel.Locale.parse(locale_tag, sep='-') - return locale - - except babel.core.UnknownLocaleError: - return None - - -def get_offical_locales( - territory: str, languages=None, regional: bool = False, de_facto: bool = True -) -> Set[babel.Locale]: - """Returns a list of :py:obj:`babel.Locale` with languages from - :py:obj:`babel.languages.get_official_languages`. - - :param territory: The territory (country or region) code. - - :param languages: A list of language codes the languages from - :py:obj:`babel.languages.get_official_languages` should be in - (intersection). If this argument is ``None``, all official languages in - this territory are used. - - :param regional: If the regional flag is set, then languages which are - regionally official are also returned. - - :param de_facto: If the de_facto flag is set to `False`, then languages - which are “de facto” official are not returned. - - """ - ret_val = set() - o_languages = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto) - - if languages: - languages = [l.lower() for l in languages] - o_languages = set(l for l in o_languages if l.lower() in languages) - - for lang in o_languages: - try: - locale = babel.Locale.parse(lang + '_' + territory) - ret_val.add(locale) - except babel.UnknownLocaleError: - continue - - return ret_val - - -def get_engine_locale(searxng_locale, engine_locales, default=None): - """Return engine's language (aka locale) string that best fits to argument - ``searxng_locale``. 
- - Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to - corresponding *engine locales*:: - - : { - # SearXNG string : engine-string - 'ca-ES' : 'ca_ES', - 'fr-BE' : 'fr_BE', - 'fr-CA' : 'fr_CA', - 'fr-CH' : 'fr_CH', - 'fr' : 'fr_FR', - ... - 'pl-PL' : 'pl_PL', - 'pt-PT' : 'pt_PT' - .. - 'zh' : 'zh' - 'zh_Hans' : 'zh' - 'zh_Hant' : 'zh_TW' - } - - .. hint:: - - The *SearXNG locale* string has to be known by babel! - - If there is no direct 1:1 mapping, this functions tries to narrow down - engine's language (locale). If no value can be determined by these - approximation attempts the ``default`` value is returned. - - Assumptions: - - A. When user select a language the results should be optimized according to - the selected language. - - B. When user select a language and a territory the results should be - optimized with first priority on terrirtory and second on language. - - First approximation rule (*by territory*): - - When the user selects a locale with terrirtory (and a language), the - territory has priority over the language. If any of the offical languages - in the terrirtory is supported by the engine (``engine_locales``) it will - be used. - - Second approximation rule (*by language*): - - If "First approximation rule" brings no result or the user selects only a - language without a terrirtory. Check in which territories the language - has an offical status and if one of these territories is supported by the - engine. - - """ - # pylint: disable=too-many-branches, too-many-return-statements - - engine_locale = engine_locales.get(searxng_locale) - - if engine_locale is not None: - # There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language - # "zh --> zh"), no need to narrow language-script nor territory. - return engine_locale - - try: - locale = babel.Locale.parse(searxng_locale, sep='-') - except babel.core.UnknownLocaleError: - try: - locale = babel.Locale.parse(searxng_locale.split('-')[0]) - except babel.core.UnknownLocaleError: - return default - - searxng_lang = language_tag(locale) - engine_locale = engine_locales.get(searxng_lang) - if engine_locale is not None: - # There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans") - return engine_locale - - # SearXNG's selected locale is not supported by the engine .. - - if locale.territory: - # Try to narrow by *offical* languages in the territory (??-XX). - - for official_language in babel.languages.get_official_languages(locale.territory, de_facto=True): - searxng_locale = official_language + '-' + locale.territory - engine_locale = engine_locales.get(searxng_locale) - if engine_locale is not None: - return engine_locale - - # Engine does not support one of the offical languages in the territory or - # there is only a language selected without a territory. - - # Now lets have a look if the searxng_lang (the language selected by the - # user) is a offical language in other territories. If so, check if - # engine does support the searxng_lang in this other territory. - - if locale.language: - - terr_lang_dict = {} - for territory, langs in babel.core.get_global("territory_languages").items(): - if not langs.get(searxng_lang, {}).get('official_status'): - continue - terr_lang_dict[territory] = langs.get(searxng_lang) - - # first: check fr-FR, de-DE .. 
is supported by the engine - # exception: 'en' --> 'en-US' - - territory = locale.language.upper() - if territory == 'EN': - territory = 'US' - - if terr_lang_dict.get(territory): - searxng_locale = locale.language + '-' + territory - engine_locale = engine_locales.get(searxng_locale) - if engine_locale is not None: - return engine_locale - - # second: sort by population_percent and take first match - - # drawback of "population percent": if there is a terrirtory with a - # small number of people (e.g 100) but the majority speaks the - # language, then the percentage migth be 100% (--> 100 people) but in - # a different terrirtory with more people (e.g. 10.000) where only 10% - # speak the language the total amount of speaker is higher (--> 200 - # people). - # - # By example: The population of Saint-Martin is 33.000, of which 100% - # speak French, but this is less than the 30% of the approximately 2.5 - # million Belgian citizens - # - # - 'fr-MF', 'population_percent': 100.0, 'official_status': 'official' - # - 'fr-BE', 'population_percent': 38.0, 'official_status': 'official' - - terr_lang_list = [] - for k, v in terr_lang_dict.items(): - terr_lang_list.append((k, v)) - - for territory, _lang in sorted(terr_lang_list, key=lambda item: item[1]['population_percent'], reverse=True): - searxng_locale = locale.language + '-' + territory - engine_locale = engine_locales.get(searxng_locale) - if engine_locale is not None: - return engine_locale - - # No luck: narrow by "language from territory" and "territory from language" - # does not fit to a locale supported by the engine. - - if engine_locale is None: - engine_locale = default - - return default - - -def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]: - """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``. - - :param str searxng_locale: SearXNG's internal representation of locale (de, - de-DE, fr-BE, zh, zh-CN, zh-TW ..). - - :param list locale_tag_list: The list of locale tags to select from - - :param str fallback: fallback locale tag (if unset --> ``None``) - - The rules to find a match are implemented in :py:obj:`get_engine_locale`, - the ``engine_locales`` is build up by :py:obj:`build_engine_locales`. - - .. hint:: - - The *SearXNG locale* string and the members of ``locale_tag_list`` has to - be known by babel! The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the - UI and are not known by babel --> will be ignored. - """ - - # searxng_locale = 'es' - # locale_tag_list = ['es-AR', 'es-ES', 'es-MX'] - - if not searxng_locale: - return fallback - - locale = get_locale(searxng_locale) - if locale is None: - return fallback - - # normalize to a SearXNG locale that can be passed to get_engine_locale - - searxng_locale = language_tag(locale) - if locale.territory: - searxng_locale = region_tag(locale) - - # clean up locale_tag_list - - tag_list = [] - for tag in locale_tag_list: - if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS: - continue - tag_list.append(tag) - - # emulate fetch_traits - engine_locales = build_engine_locales(tag_list) - return get_engine_locale(searxng_locale, engine_locales, default=fallback) - - -def build_engine_locales(tag_list: List[str]): - """From a list of locale tags a dictionary is build that can be passed by - argument ``engine_locales`` to :py:obj:`get_engine_locale`. This function - is mainly used by :py:obj:`match_locale` and is similar to what the - ``fetch_traits(..)`` function of engines do. 
- - If there are territory codes in the ``tag_list`` that have a *script code* - additional keys are added to the returned dictionary. - - .. code:: python - - >>> import locales - >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW']) - >>> engine_locales - { - 'en': 'en', 'en-US': 'en-US', - 'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN', - 'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW' - } - >>> get_engine_locale('zh-Hans', engine_locales) - 'zh-CN' - - This function is a good example to understand the language/region model - of SearXNG: - - SearXNG only distinguishes between **search languages** and **search - regions**, by adding the *script-tags*, languages with *script-tags* can - be assigned to the **regions** that SearXNG supports. - - """ - engine_locales = {} - - for tag in tag_list: - locale = get_locale(tag) - if locale is None: - logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag) - continue - if locale.territory: - engine_locales[region_tag(locale)] = tag - if locale.script: - engine_locales[language_tag(locale)] = tag - else: - engine_locales[language_tag(locale)] = tag - return engine_locales diff --git a/apps/searxng/searx/metrics/__init__.py b/apps/searxng/searx/metrics/__init__.py deleted file mode 100755 index 18d2170..0000000 --- a/apps/searxng/searx/metrics/__init__.py +++ /dev/null @@ -1,248 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring - -import typing -import math -import contextlib -from timeit import default_timer -from operator import itemgetter - -from searx.engines import engines -from .models import HistogramStorage, CounterStorage, VoidHistogram, VoidCounterStorage -from .error_recorder import count_error, count_exception, errors_per_engines - -__all__ = [ - "initialize", - "get_engines_stats", - "get_engine_errors", - "histogram", - "histogram_observe", - "histogram_observe_time", - "counter", - "counter_inc", - "counter_add", - "count_error", - "count_exception", -] - - -ENDPOINTS = {'search'} - - -histogram_storage: typing.Optional[HistogramStorage] = None -counter_storage: typing.Optional[CounterStorage] = None - - -@contextlib.contextmanager -def histogram_observe_time(*args): - h = histogram_storage.get(*args) - before = default_timer() - yield before - duration = default_timer() - before - if h: - h.observe(duration) - else: - raise ValueError("histogram " + repr((*args,)) + " doesn't not exist") - - -def histogram_observe(duration, *args): - histogram_storage.get(*args).observe(duration) - - -def histogram(*args, raise_on_not_found=True): - h = histogram_storage.get(*args) - if raise_on_not_found and h is None: - raise ValueError("histogram " + repr((*args,)) + " doesn't not exist") - return h - - -def counter_inc(*args): - counter_storage.add(1, *args) - - -def counter_add(value, *args): - counter_storage.add(value, *args) - - -def counter(*args): - return counter_storage.get(*args) - - -def initialize(engine_names=None, enabled=True): - """ - Initialize metrics - """ - global counter_storage, histogram_storage # pylint: disable=global-statement - - if enabled: - counter_storage = CounterStorage() - histogram_storage = HistogramStorage() - else: - counter_storage = VoidCounterStorage() - histogram_storage = HistogramStorage(histogram_class=VoidHistogram) - - # max_timeout = max of all the engine.timeout - max_timeout = 2 - for engine_name in engine_names or engines: - if engine_name in engines: - max_timeout = 
max(max_timeout, engines[engine_name].timeout) - - # histogram configuration - histogram_width = 0.1 - histogram_size = int(1.5 * max_timeout / histogram_width) - - # engines - for engine_name in engine_names or engines: - # search count - counter_storage.configure('engine', engine_name, 'search', 'count', 'sent') - counter_storage.configure('engine', engine_name, 'search', 'count', 'successful') - # global counter of errors - counter_storage.configure('engine', engine_name, 'search', 'count', 'error') - # score of the engine - counter_storage.configure('engine', engine_name, 'score') - # result count per requests - histogram_storage.configure(1, 100, 'engine', engine_name, 'result', 'count') - # time doing HTTP requests - histogram_storage.configure(histogram_width, histogram_size, 'engine', engine_name, 'time', 'http') - # total time - # .time.request and ...response times may overlap .time.http time. - histogram_storage.configure(histogram_width, histogram_size, 'engine', engine_name, 'time', 'total') - - -def get_engine_errors(engline_name_list): - result = {} - engine_names = list(errors_per_engines.keys()) - engine_names.sort() - for engine_name in engine_names: - if engine_name not in engline_name_list: - continue - - error_stats = errors_per_engines[engine_name] - sent_search_count = max(counter('engine', engine_name, 'search', 'count', 'sent'), 1) - sorted_context_count_list = sorted(error_stats.items(), key=lambda context_count: context_count[1]) - r = [] - for context, count in sorted_context_count_list: - percentage = round(20 * count / sent_search_count) * 5 - r.append( - { - 'filename': context.filename, - 'function': context.function, - 'line_no': context.line_no, - 'code': context.code, - 'exception_classname': context.exception_classname, - 'log_message': context.log_message, - 'log_parameters': context.log_parameters, - 'secondary': context.secondary, - 'percentage': percentage, - } - ) - result[engine_name] = sorted(r, reverse=True, key=lambda d: d['percentage']) - return result - - -def get_reliabilities(engline_name_list, checker_results): - reliabilities = {} - - engine_errors = get_engine_errors(engline_name_list) - - for engine_name in engline_name_list: - checker_result = checker_results.get(engine_name, {}) - checker_success = checker_result.get('success', True) - errors = engine_errors.get(engine_name) or [] - if counter('engine', engine_name, 'search', 'count', 'sent') == 0: - # no request - reliablity = None - elif checker_success and not errors: - reliablity = 100 - elif 'simple' in checker_result.get('errors', {}): - # the basic (simple) test doesn't work: the engine is broken accoding to the checker - # even if there is no exception - reliablity = 0 - else: - # pylint: disable=consider-using-generator - reliablity = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')]) - - reliabilities[engine_name] = { - 'reliablity': reliablity, - 'errors': errors, - 'checker': checker_results.get(engine_name, {}).get('errors', {}), - } - return reliabilities - - -def get_engines_stats(engine_name_list): - assert counter_storage is not None - assert histogram_storage is not None - - list_time = [] - max_time_total = max_result_count = None - - for engine_name in engine_name_list: - - sent_count = counter('engine', engine_name, 'search', 'count', 'sent') - if sent_count == 0: - continue - - result_count = histogram('engine', engine_name, 'result', 'count').percentage(50) - result_count_sum = histogram('engine', engine_name, 'result', 
'count').sum - successful_count = counter('engine', engine_name, 'search', 'count', 'successful') - - time_total = histogram('engine', engine_name, 'time', 'total').percentage(50) - max_time_total = max(time_total or 0, max_time_total or 0) - max_result_count = max(result_count or 0, max_result_count or 0) - - stats = { - 'name': engine_name, - 'total': None, - 'total_p80': None, - 'total_p95': None, - 'http': None, - 'http_p80': None, - 'http_p95': None, - 'processing': None, - 'processing_p80': None, - 'processing_p95': None, - 'score': 0, - 'score_per_result': 0, - 'result_count': result_count, - } - - if successful_count and result_count_sum: - score = counter('engine', engine_name, 'score') - - stats['score'] = score - stats['score_per_result'] = score / float(result_count_sum) - - time_http = histogram('engine', engine_name, 'time', 'http').percentage(50) - time_http_p80 = time_http_p95 = 0 - - if time_http is not None: - - time_http_p80 = histogram('engine', engine_name, 'time', 'http').percentage(80) - time_http_p95 = histogram('engine', engine_name, 'time', 'http').percentage(95) - - stats['http'] = round(time_http, 1) - stats['http_p80'] = round(time_http_p80, 1) - stats['http_p95'] = round(time_http_p95, 1) - - if time_total is not None: - - time_total_p80 = histogram('engine', engine_name, 'time', 'total').percentage(80) - time_total_p95 = histogram('engine', engine_name, 'time', 'total').percentage(95) - - stats['total'] = round(time_total, 1) - stats['total_p80'] = round(time_total_p80, 1) - stats['total_p95'] = round(time_total_p95, 1) - - stats['processing'] = round(time_total - (time_http or 0), 1) - stats['processing_p80'] = round(time_total_p80 - time_http_p80, 1) - stats['processing_p95'] = round(time_total_p95 - time_http_p95, 1) - - list_time.append(stats) - - return { - 'time': list_time, - 'max_time': math.ceil(max_time_total or 0), - 'max_result_count': math.ceil(max_result_count or 0), - } diff --git a/apps/searxng/searx/metrics/error_recorder.py b/apps/searxng/searx/metrics/error_recorder.py deleted file mode 100755 index 1d0d6e7..0000000 --- a/apps/searxng/searx/metrics/error_recorder.py +++ /dev/null @@ -1,190 +0,0 @@ -import typing -import inspect -from json import JSONDecodeError -from urllib.parse import urlparse -from httpx import HTTPError, HTTPStatusError -from searx.exceptions import ( - SearxXPathSyntaxException, - SearxEngineXPathException, - SearxEngineAPIException, - SearxEngineAccessDeniedException, -) -from searx import searx_parent_dir, settings -from searx.engines import engines - - -errors_per_engines = {} - - -class ErrorContext: - - __slots__ = ( - 'filename', - 'function', - 'line_no', - 'code', - 'exception_classname', - 'log_message', - 'log_parameters', - 'secondary', - ) - - def __init__(self, filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary): - self.filename = filename - self.function = function - self.line_no = line_no - self.code = code - self.exception_classname = exception_classname - self.log_message = log_message - self.log_parameters = log_parameters - self.secondary = secondary - - def __eq__(self, o) -> bool: - if not isinstance(o, ErrorContext): - return False - return ( - self.filename == o.filename - and self.function == o.function - and self.line_no == o.line_no - and self.code == o.code - and self.exception_classname == o.exception_classname - and self.log_message == o.log_message - and self.log_parameters == o.log_parameters - and self.secondary == o.secondary - ) - - def 
__hash__(self): - return hash( - ( - self.filename, - self.function, - self.line_no, - self.code, - self.exception_classname, - self.log_message, - self.log_parameters, - self.secondary, - ) - ) - - def __repr__(self): - return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}) {!r}".format( - self.filename, - self.line_no, - self.code, - self.exception_classname, - self.log_message, - self.log_parameters, - self.secondary, - ) - - -def add_error_context(engine_name: str, error_context: ErrorContext) -> None: - errors_for_engine = errors_per_engines.setdefault(engine_name, {}) - errors_for_engine[error_context] = errors_for_engine.get(error_context, 0) + 1 - engines[engine_name].logger.warning('%s', str(error_context)) - - -def get_trace(traces): - for trace in reversed(traces): - split_filename = trace.filename.split('/') - if '/'.join(split_filename[-3:-1]) == 'searx/engines': - return trace - if '/'.join(split_filename[-4:-1]) == 'searx/search/processors': - return trace - return traces[-1] - - -def get_hostname(exc: HTTPError) -> typing.Optional[None]: - url = exc.request.url - if url is None and exc.response is not None: - url = exc.response.url - return urlparse(url).netloc - - -def get_request_exception_messages( - exc: HTTPError, -) -> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]: - url = None - status_code = None - reason = None - hostname = None - if hasattr(exc, '_request') and exc._request is not None: - # exc.request is property that raise an RuntimeException - # if exc._request is not defined. - url = exc.request.url - if url is None and hasattr(exc, 'response') and exc.response is not None: - url = exc.response.url - if url is not None: - hostname = url.host - if isinstance(exc, HTTPStatusError): - status_code = str(exc.response.status_code) - reason = exc.response.reason_phrase - return (status_code, reason, hostname) - - -def get_messages(exc, filename) -> typing.Tuple: - if isinstance(exc, JSONDecodeError): - return (exc.msg,) - if isinstance(exc, TypeError): - return (str(exc),) - if isinstance(exc, ValueError) and 'lxml' in filename: - return (str(exc),) - if isinstance(exc, HTTPError): - return get_request_exception_messages(exc) - if isinstance(exc, SearxXPathSyntaxException): - return (exc.xpath_str, exc.message) - if isinstance(exc, SearxEngineXPathException): - return (exc.xpath_str, exc.message) - if isinstance(exc, SearxEngineAPIException): - return (str(exc.args[0]),) - if isinstance(exc, SearxEngineAccessDeniedException): - return (exc.message,) - return () - - -def get_exception_classname(exc: Exception) -> str: - exc_class = exc.__class__ - exc_name = exc_class.__qualname__ - exc_module = exc_class.__module__ - if exc_module is None or exc_module == str.__class__.__module__: - return exc_name - return exc_module + '.' 
+ exc_name - - -def get_error_context(framerecords, exception_classname, log_message, log_parameters, secondary) -> ErrorContext: - searx_frame = get_trace(framerecords) - filename = searx_frame.filename - if filename.startswith(searx_parent_dir): - filename = filename[len(searx_parent_dir) + 1 :] - function = searx_frame.function - line_no = searx_frame.lineno - code = searx_frame.code_context[0].strip() - del framerecords - return ErrorContext(filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary) - - -def count_exception(engine_name: str, exc: Exception, secondary: bool = False) -> None: - if not settings['general']['enable_metrics']: - return - framerecords = inspect.trace() - try: - exception_classname = get_exception_classname(exc) - log_parameters = get_messages(exc, framerecords[-1][1]) - error_context = get_error_context(framerecords, exception_classname, None, log_parameters, secondary) - add_error_context(engine_name, error_context) - finally: - del framerecords - - -def count_error( - engine_name: str, log_message: str, log_parameters: typing.Optional[typing.Tuple] = None, secondary: bool = False -) -> None: - if not settings['general']['enable_metrics']: - return - framerecords = list(reversed(inspect.stack()[1:])) - try: - error_context = get_error_context(framerecords, None, log_message, log_parameters or (), secondary) - add_error_context(engine_name, error_context) - finally: - del framerecords diff --git a/apps/searxng/searx/metrics/models.py b/apps/searxng/searx/metrics/models.py deleted file mode 100755 index 900a7fa..0000000 --- a/apps/searxng/searx/metrics/models.py +++ /dev/null @@ -1,167 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import decimal -import threading - -from searx import logger - - -__all__ = ["Histogram", "HistogramStorage", "CounterStorage"] - -logger = logger.getChild('searx.metrics') - - -class Histogram: - - _slots__ = '_lock', '_size', '_sum', '_quartiles', '_count', '_width' - - def __init__(self, width=10, size=200): - self._lock = threading.Lock() - self._width = width - self._size = size - self._quartiles = [0] * size - self._count = 0 - self._sum = 0 - - def observe(self, value): - q = int(value / self._width) - if q < 0: - """Value below zero is ignored""" - q = 0 - if q >= self._size: - """Value above the maximum is replaced by the maximum""" - q = self._size - 1 - with self._lock: - self._quartiles[q] += 1 - self._count += 1 - self._sum += value - - @property - def quartiles(self): - return list(self._quartiles) - - @property - def count(self): - return self._count - - @property - def sum(self): - return self._sum - - @property - def average(self): - with self._lock: - if self._count != 0: - return self._sum / self._count - else: - return 0 - - @property - def quartile_percentage(self): - '''Quartile in percentage''' - with self._lock: - if self._count > 0: - return [int(q * 100 / self._count) for q in self._quartiles] - else: - return self._quartiles - - @property - def quartile_percentage_map(self): - result = {} - # use Decimal to avoid rounding errors - x = decimal.Decimal(0) - width = decimal.Decimal(self._width) - width_exponent = -width.as_tuple().exponent - with self._lock: - if self._count > 0: - for y in self._quartiles: - yp = int(y * 100 / self._count) - if yp != 0: - result[round(float(x), width_exponent)] = yp - x += width - return result - - def percentage(self, percentage): - # use Decimal to avoid rounding errors - x = decimal.Decimal(0) - width = 
decimal.Decimal(self._width) - stop_at_value = decimal.Decimal(self._count) / 100 * percentage - sum_value = 0 - with self._lock: - if self._count > 0: - for y in self._quartiles: - sum_value += y - if sum_value >= stop_at_value: - return x - x += width - return None - - def __repr__(self): - return "Histogram" - - -class HistogramStorage: - - __slots__ = 'measures', 'histogram_class' - - def __init__(self, histogram_class=Histogram): - self.clear() - self.histogram_class = histogram_class - - def clear(self): - self.measures = {} - - def configure(self, width, size, *args): - measure = self.histogram_class(width, size) - self.measures[args] = measure - return measure - - def get(self, *args): - return self.measures.get(args, None) - - def dump(self): - logger.debug("Histograms:") - ks = sorted(self.measures.keys(), key='/'.join) - for k in ks: - logger.debug("- %-60s %s", '|'.join(k), self.measures[k]) - - -class CounterStorage: - - __slots__ = 'counters', 'lock' - - def __init__(self): - self.lock = threading.Lock() - self.clear() - - def clear(self): - with self.lock: - self.counters = {} - - def configure(self, *args): - with self.lock: - self.counters[args] = 0 - - def get(self, *args): - return self.counters[args] - - def add(self, value, *args): - with self.lock: - self.counters[args] += value - - def dump(self): - with self.lock: - ks = sorted(self.counters.keys(), key='/'.join) - logger.debug("Counters:") - for k in ks: - logger.debug("- %-60s %s", '|'.join(k), self.counters[k]) - - -class VoidHistogram(Histogram): - def observe(self, value): - pass - - -class VoidCounterStorage(CounterStorage): - def add(self, value, *args): - pass diff --git a/apps/searxng/searx/network/__init__.py b/apps/searxng/searx/network/__init__.py deleted file mode 100755 index 8622e97..0000000 --- a/apps/searxng/searx/network/__init__.py +++ /dev/null @@ -1,266 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring, global-statement - -import asyncio -import threading -import concurrent.futures -from queue import SimpleQueue -from types import MethodType -from timeit import default_timer -from typing import Iterable, NamedTuple, Tuple, List, Dict, Union -from contextlib import contextmanager - -import httpx -import anyio - -from .network import get_network, initialize, check_network_configuration # pylint:disable=cyclic-import -from .client import get_loop -from .raise_for_httperror import raise_for_httperror - - -THREADLOCAL = threading.local() -"""Thread-local data is data for thread specific values.""" - - -def reset_time_for_thread(): - THREADLOCAL.total_time = 0 - - -def get_time_for_thread(): - """returns thread's total time or None""" - return THREADLOCAL.__dict__.get('total_time') - - -def set_timeout_for_thread(timeout, start_time=None): - THREADLOCAL.timeout = timeout - THREADLOCAL.start_time = start_time - - -def set_context_network_name(network_name): - THREADLOCAL.network = get_network(network_name) - - -def get_context_network(): - """If set return thread's network. - - If unset, return value from :py:obj:`get_network`. - """ - return THREADLOCAL.__dict__.get('network') or get_network() - - -@contextmanager -def _record_http_time(): - # pylint: disable=too-many-branches - time_before_request = default_timer() - start_time = getattr(THREADLOCAL, 'start_time', time_before_request) - try: - yield start_time - finally: - # update total_time. 
- # See get_time_for_thread() and reset_time_for_thread() - if hasattr(THREADLOCAL, 'total_time'): - time_after_request = default_timer() - THREADLOCAL.total_time += time_after_request - time_before_request - - -def _get_timeout(start_time, kwargs): - # pylint: disable=too-many-branches - - # timeout (httpx) - if 'timeout' in kwargs: - timeout = kwargs['timeout'] - else: - timeout = getattr(THREADLOCAL, 'timeout', None) - if timeout is not None: - kwargs['timeout'] = timeout - - # 2 minutes timeout for the requests without timeout - timeout = timeout or 120 - - # ajdust actual timeout - timeout += 0.2 # overhead - if start_time: - timeout -= default_timer() - start_time - - return timeout - - -def request(method, url, **kwargs): - """same as requests/requests/api.py request(...)""" - with _record_http_time() as start_time: - network = get_context_network() - timeout = _get_timeout(start_time, kwargs) - future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop()) - try: - return future.result(timeout) - except concurrent.futures.TimeoutError as e: - raise httpx.TimeoutException('Timeout', request=None) from e - - -def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]: - """send multiple HTTP requests in parallel. Wait for all requests to finish.""" - with _record_http_time() as start_time: - # send the requests - network = get_context_network() - loop = get_loop() - future_list = [] - for request_desc in request_list: - timeout = _get_timeout(start_time, request_desc.kwargs) - future = asyncio.run_coroutine_threadsafe( - network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop - ) - future_list.append((future, timeout)) - - # read the responses - responses = [] - for future, timeout in future_list: - try: - responses.append(future.result(timeout)) - except concurrent.futures.TimeoutError: - responses.append(httpx.TimeoutException('Timeout', request=None)) - except Exception as e: # pylint: disable=broad-except - responses.append(e) - return responses - - -class Request(NamedTuple): - """Request description for the multi_requests function""" - - method: str - url: str - kwargs: Dict[str, str] = {} - - @staticmethod - def get(url, **kwargs): - return Request('GET', url, kwargs) - - @staticmethod - def options(url, **kwargs): - return Request('OPTIONS', url, kwargs) - - @staticmethod - def head(url, **kwargs): - return Request('HEAD', url, kwargs) - - @staticmethod - def post(url, **kwargs): - return Request('POST', url, kwargs) - - @staticmethod - def put(url, **kwargs): - return Request('PUT', url, kwargs) - - @staticmethod - def patch(url, **kwargs): - return Request('PATCH', url, kwargs) - - @staticmethod - def delete(url, **kwargs): - return Request('DELETE', url, kwargs) - - -def get(url, **kwargs): - kwargs.setdefault('allow_redirects', True) - return request('get', url, **kwargs) - - -def options(url, **kwargs): - kwargs.setdefault('allow_redirects', True) - return request('options', url, **kwargs) - - -def head(url, **kwargs): - kwargs.setdefault('allow_redirects', False) - return request('head', url, **kwargs) - - -def post(url, data=None, **kwargs): - return request('post', url, data=data, **kwargs) - - -def put(url, data=None, **kwargs): - return request('put', url, data=data, **kwargs) - - -def patch(url, data=None, **kwargs): - return request('patch', url, data=data, **kwargs) - - -def delete(url, **kwargs): - return request('delete', url, **kwargs) - - -async def 
stream_chunk_to_queue(network, queue, method, url, **kwargs): - try: - async with await network.stream(method, url, **kwargs) as response: - queue.put(response) - # aiter_raw: access the raw bytes on the response without applying any HTTP content decoding - # https://www.python-httpx.org/quickstart/#streaming-responses - async for chunk in response.aiter_raw(65536): - if len(chunk) > 0: - queue.put(chunk) - except (httpx.StreamClosed, anyio.ClosedResourceError): - # the response was queued before the exception. - # the exception was raised on aiter_raw. - # we do nothing here: in the finally block, None will be queued - # so stream(method, url, **kwargs) generator can stop - pass - except Exception as e: # pylint: disable=broad-except - # broad except to avoid this scenario: - # exception in network.stream(method, url, **kwargs) - # -> the exception is not catch here - # -> queue None (in finally) - # -> the function below steam(method, url, **kwargs) has nothing to return - queue.put(e) - finally: - queue.put(None) - - -def _stream_generator(method, url, **kwargs): - queue = SimpleQueue() - network = get_context_network() - future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(network, queue, method, url, **kwargs), get_loop()) - - # yield chunks - obj_or_exception = queue.get() - while obj_or_exception is not None: - if isinstance(obj_or_exception, Exception): - raise obj_or_exception - yield obj_or_exception - obj_or_exception = queue.get() - future.result() - - -def _close_response_method(self): - asyncio.run_coroutine_threadsafe(self.aclose(), get_loop()) - # reach the end of _self.generator ( _stream_generator ) to an avoid memory leak. - # it makes sure that : - # * the httpx response is closed (see the stream_chunk_to_queue function) - # * to call future.result() in _stream_generator - for _ in self._generator: # pylint: disable=protected-access - continue - - -def stream(method, url, **kwargs) -> Tuple[httpx.Response, Iterable[bytes]]: - """Replace httpx.stream. - - Usage: - response, stream = poolrequests.stream(...) - for chunk in stream: - ... - - httpx.Client.stream requires to write the httpx.HTTPTransport version of the - the httpx.AsyncHTTPTransport declared above. 
- """ - generator = _stream_generator(method, url, **kwargs) - - # yield response - response = next(generator) # pylint: disable=stop-iteration-return - if isinstance(response, Exception): - raise response - - response._generator = generator # pylint: disable=protected-access - response.close = MethodType(_close_response_method, response) - - return response, generator diff --git a/apps/searxng/searx/network/client.py b/apps/searxng/searx/network/client.py deleted file mode 100755 index ffee3f0..0000000 --- a/apps/searxng/searx/network/client.py +++ /dev/null @@ -1,200 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring, global-statement - -import asyncio -import logging -import random -from ssl import SSLContext -import threading -from typing import Any, Dict - -import httpx -from httpx_socks import AsyncProxyTransport -from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError - -from searx import logger - -# Optional uvloop (support Python 3.6) -try: - import uvloop -except ImportError: - pass -else: - uvloop.install() - - -logger = logger.getChild('searx.network.client') -LOOP = None -SSLCONTEXTS: Dict[Any, SSLContext] = {} - - -def shuffle_ciphers(ssl_context): - """Shuffle httpx's default ciphers of a SSL context randomly. - - From `What Is TLS Fingerprint and How to Bypass It`_ - - > When implementing TLS fingerprinting, servers can't operate based on a - > locked-in whitelist database of fingerprints. New fingerprints appear - > when web clients or TLS libraries release new versions. So, they have to - > live off a blocklist database instead. - > ... - > It's safe to leave the first three as is but shuffle the remaining ciphers - > and you can bypass the TLS fingerprint check. - - .. 
_What Is TLS Fingerprint and How to Bypass It: - https://www.zenrows.com/blog/what-is-tls-fingerprint#how-to-bypass-tls-fingerprinting - - """ - c_list = httpx._config.DEFAULT_CIPHERS.split(':') # pylint: disable=protected-access - sc_list, c_list = c_list[:3], c_list[3:] - random.shuffle(c_list) - ssl_context.set_ciphers(":".join(sc_list + c_list)) - - -def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False): - key = (proxy_url, cert, verify, trust_env, http2) - if key not in SSLCONTEXTS: - SSLCONTEXTS[key] = httpx.create_ssl_context(cert, verify, trust_env, http2) - shuffle_ciphers(SSLCONTEXTS[key]) - return SSLCONTEXTS[key] - - -class AsyncHTTPTransportNoHttp(httpx.AsyncHTTPTransport): - """Block HTTP request""" - - async def handle_async_request(self, request): - raise httpx.UnsupportedProtocol('HTTP protocol is disabled') - - -class AsyncProxyTransportFixed(AsyncProxyTransport): - """Fix httpx_socks.AsyncProxyTransport - - Map python_socks exceptions to httpx.ProxyError exceptions - """ - - async def handle_async_request(self, request): - try: - return await super().handle_async_request(request) - except ProxyConnectionError as e: - raise httpx.ProxyError("ProxyConnectionError: " + e.strerror, request=request) from e - except ProxyTimeoutError as e: - raise httpx.ProxyError("ProxyTimeoutError: " + e.args[0], request=request) from e - except ProxyError as e: - raise httpx.ProxyError("ProxyError: " + e.args[0], request=request) from e - - -def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries): - # support socks5h (requests compatibility): - # https://requests.readthedocs.io/en/master/user/advanced/#socks - # socks5:// hostname is resolved on client side - # socks5h:// hostname is resolved on proxy side - rdns = False - socks5h = 'socks5h://' - if proxy_url.startswith(socks5h): - proxy_url = 'socks5://' + proxy_url[len(socks5h) :] - rdns = True - - proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url) - verify = get_sslcontexts(proxy_url, None, verify, True, http2) if verify is True else verify - return AsyncProxyTransportFixed( - proxy_type=proxy_type, - proxy_host=proxy_host, - proxy_port=proxy_port, - username=proxy_username, - password=proxy_password, - rdns=rdns, - loop=get_loop(), - verify=verify, - http2=http2, - local_address=local_address, - limits=limit, - retries=retries, - ) - - -def get_transport(verify, http2, local_address, proxy_url, limit, retries): - verify = get_sslcontexts(None, None, verify, True, http2) if verify is True else verify - return httpx.AsyncHTTPTransport( - # pylint: disable=protected-access - verify=verify, - http2=http2, - limits=limit, - proxy=httpx._config.Proxy(proxy_url) if proxy_url else None, - local_address=local_address, - retries=retries, - ) - - -def new_client( - # pylint: disable=too-many-arguments - enable_http, - verify, - enable_http2, - max_connections, - max_keepalive_connections, - keepalive_expiry, - proxies, - local_address, - retries, - max_redirects, - hook_log_response, -): - limit = httpx.Limits( - max_connections=max_connections, - max_keepalive_connections=max_keepalive_connections, - keepalive_expiry=keepalive_expiry, - ) - # See https://www.python-httpx.org/advanced/#routing - mounts = {} - for pattern, proxy_url in proxies.items(): - if not enable_http and pattern.startswith('http://'): - continue - if proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://'): - 
mounts[pattern] = get_transport_for_socks_proxy( - verify, enable_http2, local_address, proxy_url, limit, retries - ) - else: - mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries) - - if not enable_http: - mounts['http://'] = AsyncHTTPTransportNoHttp() - - transport = get_transport(verify, enable_http2, local_address, None, limit, retries) - - event_hooks = None - if hook_log_response: - event_hooks = {'response': [hook_log_response]} - - return httpx.AsyncClient( - transport=transport, - mounts=mounts, - max_redirects=max_redirects, - event_hooks=event_hooks, - ) - - -def get_loop(): - return LOOP - - -def init(): - # log - for logger_name in ('hpack.hpack', 'hpack.table', 'httpx._client'): - logging.getLogger(logger_name).setLevel(logging.WARNING) - - # loop - def loop_thread(): - global LOOP - LOOP = asyncio.new_event_loop() - LOOP.run_forever() - - thread = threading.Thread( - target=loop_thread, - name='asyncio_loop', - daemon=True, - ) - thread.start() - - -init() diff --git a/apps/searxng/searx/network/network.py b/apps/searxng/searx/network/network.py deleted file mode 100755 index 6e1825d..0000000 --- a/apps/searxng/searx/network/network.py +++ /dev/null @@ -1,427 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=global-statement -# pylint: disable=missing-module-docstring, missing-class-docstring - -import atexit -import asyncio -import ipaddress -from itertools import cycle -from typing import Dict - -import httpx - -from searx import logger, searx_debug -from .client import new_client, get_loop, AsyncHTTPTransportNoHttp -from .raise_for_httperror import raise_for_httperror - - -logger = logger.getChild('network') -DEFAULT_NAME = '__DEFAULT__' -NETWORKS: Dict[str, 'Network'] = {} -# requests compatibility when reading proxy settings from settings.yml -PROXY_PATTERN_MAPPING = { - 'http': 'http://', - 'https': 'https://', - 'socks4': 'socks4://', - 'socks5': 'socks5://', - 'socks5h': 'socks5h://', - 'http:': 'http://', - 'https:': 'https://', - 'socks4:': 'socks4://', - 'socks5:': 'socks5://', - 'socks5h:': 'socks5h://', -} - -ADDRESS_MAPPING = {'ipv4': '0.0.0.0', 'ipv6': '::'} - - -class Network: - - __slots__ = ( - 'enable_http', - 'verify', - 'enable_http2', - 'max_connections', - 'max_keepalive_connections', - 'keepalive_expiry', - 'local_addresses', - 'proxies', - 'using_tor_proxy', - 'max_redirects', - 'retries', - 'retry_on_http_error', - '_local_addresses_cycle', - '_proxies_cycle', - '_clients', - '_logger', - ) - - _TOR_CHECK_RESULT = {} - - def __init__( - # pylint: disable=too-many-arguments - self, - enable_http=True, - verify=True, - enable_http2=False, - max_connections=None, - max_keepalive_connections=None, - keepalive_expiry=None, - proxies=None, - using_tor_proxy=False, - local_addresses=None, - retries=0, - retry_on_http_error=None, - max_redirects=30, - logger_name=None, - ): - - self.enable_http = enable_http - self.verify = verify - self.enable_http2 = enable_http2 - self.max_connections = max_connections - self.max_keepalive_connections = max_keepalive_connections - self.keepalive_expiry = keepalive_expiry - self.proxies = proxies - self.using_tor_proxy = using_tor_proxy - self.local_addresses = local_addresses - self.retries = retries - self.retry_on_http_error = retry_on_http_error - self.max_redirects = max_redirects - self._local_addresses_cycle = self.get_ipaddress_cycle() - self._proxies_cycle = self.get_proxy_cycles() - self._clients = {} - self._logger = 
logger.getChild(logger_name) if logger_name else logger - self.check_parameters() - - def check_parameters(self): - for address in self.iter_ipaddresses(): - if '/' in address: - ipaddress.ip_network(address, False) - else: - ipaddress.ip_address(address) - - if self.proxies is not None and not isinstance(self.proxies, (str, dict)): - raise ValueError('proxies type has to be str, dict or None') - - def iter_ipaddresses(self): - local_addresses = self.local_addresses - if not local_addresses: - return - if isinstance(local_addresses, str): - local_addresses = [local_addresses] - for address in local_addresses: - yield address - - def get_ipaddress_cycle(self): - while True: - count = 0 - for address in self.iter_ipaddresses(): - if '/' in address: - for a in ipaddress.ip_network(address, False).hosts(): - yield str(a) - count += 1 - else: - a = ipaddress.ip_address(address) - yield str(a) - count += 1 - if count == 0: - yield None - - def iter_proxies(self): - if not self.proxies: - return - # https://www.python-httpx.org/compatibility/#proxy-keys - if isinstance(self.proxies, str): - yield 'all://', [self.proxies] - else: - for pattern, proxy_url in self.proxies.items(): - pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern) - if isinstance(proxy_url, str): - proxy_url = [proxy_url] - yield pattern, proxy_url - - def get_proxy_cycles(self): - proxy_settings = {} - for pattern, proxy_urls in self.iter_proxies(): - proxy_settings[pattern] = cycle(proxy_urls) - while True: - # pylint: disable=stop-iteration-return - yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items()) - - async def log_response(self, response: httpx.Response): - request = response.request - status = f"{response.status_code} {response.reason_phrase}" - response_line = f"{response.http_version} {status}" - content_type = response.headers.get("Content-Type") - content_type = f' ({content_type})' if content_type else '' - self._logger.debug(f'HTTP Request: {request.method} {request.url} "{response_line}"{content_type}') - - @staticmethod - async def check_tor_proxy(client: httpx.AsyncClient, proxies) -> bool: - if proxies in Network._TOR_CHECK_RESULT: - return Network._TOR_CHECK_RESULT[proxies] - - result = True - # ignore client._transport because it is not used with all:// - for transport in client._mounts.values(): # pylint: disable=protected-access - if isinstance(transport, AsyncHTTPTransportNoHttp): - continue - if getattr(transport, "_pool") and getattr( - transport._pool, "_rdns", False # pylint: disable=protected-access - ): - continue - return False - response = await client.get("https://check.torproject.org/api/ip", timeout=60) - if not response.json()["IsTor"]: - result = False - Network._TOR_CHECK_RESULT[proxies] = result - return result - - async def get_client(self, verify=None, max_redirects=None): - verify = self.verify if verify is None else verify - max_redirects = self.max_redirects if max_redirects is None else max_redirects - local_address = next(self._local_addresses_cycle) - proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key - key = (verify, max_redirects, local_address, proxies) - hook_log_response = self.log_response if searx_debug else None - if key not in self._clients or self._clients[key].is_closed: - client = new_client( - self.enable_http, - verify, - self.enable_http2, - self.max_connections, - self.max_keepalive_connections, - self.keepalive_expiry, - dict(proxies), - local_address, - 0, - max_redirects, - 
hook_log_response, - ) - if self.using_tor_proxy and not await self.check_tor_proxy(client, proxies): - await client.aclose() - raise httpx.ProxyError('Network configuration problem: not using Tor') - self._clients[key] = client - return self._clients[key] - - async def aclose(self): - async def close_client(client): - try: - await client.aclose() - except httpx.HTTPError: - pass - - await asyncio.gather(*[close_client(client) for client in self._clients.values()], return_exceptions=False) - - @staticmethod - def extract_kwargs_clients(kwargs): - kwargs_clients = {} - if 'verify' in kwargs: - kwargs_clients['verify'] = kwargs.pop('verify') - if 'max_redirects' in kwargs: - kwargs_clients['max_redirects'] = kwargs.pop('max_redirects') - if 'allow_redirects' in kwargs: - # see https://github.com/encode/httpx/pull/1808 - kwargs['follow_redirects'] = kwargs.pop('allow_redirects') - return kwargs_clients - - @staticmethod - def extract_do_raise_for_httperror(kwargs): - do_raise_for_httperror = True - if 'raise_for_httperror' in kwargs: - do_raise_for_httperror = kwargs['raise_for_httperror'] - del kwargs['raise_for_httperror'] - return do_raise_for_httperror - - @staticmethod - def patch_response(response, do_raise_for_httperror): - if isinstance(response, httpx.Response): - # requests compatibility (response is not streamed) - # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses - response.ok = not response.is_error - - # raise an exception - if do_raise_for_httperror: - raise_for_httperror(response) - - return response - - def is_valid_response(self, response): - # pylint: disable=too-many-boolean-expressions - if ( - (self.retry_on_http_error is True and 400 <= response.status_code <= 599) - or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error) - or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error) - ): - return False - return True - - async def call_client(self, stream, method, url, **kwargs): - retries = self.retries - was_disconnected = False - do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs) - kwargs_clients = Network.extract_kwargs_clients(kwargs) - while retries >= 0: # pragma: no cover - client = await self.get_client(**kwargs_clients) - try: - if stream: - response = client.stream(method, url, **kwargs) - else: - response = await client.request(method, url, **kwargs) - if self.is_valid_response(response) or retries <= 0: - return Network.patch_response(response, do_raise_for_httperror) - except httpx.RemoteProtocolError as e: - if not was_disconnected: - # the server has closed the connection: - # try again without decreasing the retries variable & with a new HTTP client - was_disconnected = True - await client.aclose() - self._logger.warning('httpx.RemoteProtocolError: the server has disconnected, retrying') - continue - if retries <= 0: - raise e - except (httpx.RequestError, httpx.HTTPStatusError) as e: - if retries <= 0: - raise e - retries -= 1 - - async def request(self, method, url, **kwargs): - return await self.call_client(False, method, url, **kwargs) - - async def stream(self, method, url, **kwargs): - return await self.call_client(True, method, url, **kwargs) - - @classmethod - async def aclose_all(cls): - await asyncio.gather(*[network.aclose() for network in NETWORKS.values()], return_exceptions=False) - - -def get_network(name=None): - return NETWORKS.get(name or DEFAULT_NAME) - - -def check_network_configuration(): - 
async def check(): - exception_count = 0 - for network in NETWORKS.values(): - if network.using_tor_proxy: - try: - await network.get_client() - except Exception: # pylint: disable=broad-except - network._logger.exception('Error') # pylint: disable=protected-access - exception_count += 1 - return exception_count - - future = asyncio.run_coroutine_threadsafe(check(), get_loop()) - exception_count = future.result() - if exception_count > 0: - raise RuntimeError("Invalid network configuration") - - -def initialize(settings_engines=None, settings_outgoing=None): - # pylint: disable=import-outside-toplevel) - from searx.engines import engines - from searx import settings - - # pylint: enable=import-outside-toplevel) - - settings_engines = settings_engines or settings['engines'] - settings_outgoing = settings_outgoing or settings['outgoing'] - - # default parameters for AsyncHTTPTransport - # see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # pylint: disable=line-too-long - default_params = { - 'enable_http': False, - 'verify': settings_outgoing['verify'], - 'enable_http2': settings_outgoing['enable_http2'], - 'max_connections': settings_outgoing['pool_connections'], - 'max_keepalive_connections': settings_outgoing['pool_maxsize'], - 'keepalive_expiry': settings_outgoing['keepalive_expiry'], - 'local_addresses': settings_outgoing['source_ips'], - 'using_tor_proxy': settings_outgoing['using_tor_proxy'], - 'proxies': settings_outgoing['proxies'], - 'max_redirects': settings_outgoing['max_redirects'], - 'retries': settings_outgoing['retries'], - 'retry_on_http_error': None, - } - - def new_network(params, logger_name=None): - nonlocal default_params - result = {} - result.update(default_params) - result.update(params) - if logger_name: - result['logger_name'] = logger_name - return Network(**result) - - def iter_networks(): - nonlocal settings_engines - for engine_spec in settings_engines: - engine_name = engine_spec['name'] - engine = engines.get(engine_name) - if engine is None: - continue - network = getattr(engine, 'network', None) - yield engine_name, engine, network - - if NETWORKS: - done() - NETWORKS.clear() - NETWORKS[DEFAULT_NAME] = new_network({}, logger_name='default') - NETWORKS['ipv4'] = new_network({'local_addresses': '0.0.0.0'}, logger_name='ipv4') - NETWORKS['ipv6'] = new_network({'local_addresses': '::'}, logger_name='ipv6') - - # define networks from outgoing.networks - for network_name, network in settings_outgoing['networks'].items(): - NETWORKS[network_name] = new_network(network, logger_name=network_name) - - # define networks from engines.[i].network (except references) - for engine_name, engine, network in iter_networks(): - if network is None: - network = {} - for attribute_name, attribute_value in default_params.items(): - if hasattr(engine, attribute_name): - network[attribute_name] = getattr(engine, attribute_name) - else: - network[attribute_name] = attribute_value - NETWORKS[engine_name] = new_network(network, logger_name=engine_name) - elif isinstance(network, dict): - NETWORKS[engine_name] = new_network(network, logger_name=engine_name) - - # define networks from engines.[i].network (references) - for engine_name, engine, network in iter_networks(): - if isinstance(network, str): - NETWORKS[engine_name] = NETWORKS[network] - - # the /image_proxy endpoint has a dedicated network. - # same parameters than the default network, but HTTP/2 is disabled. 
- # It decreases the CPU load average, and the total time is more or less the same - if 'image_proxy' not in NETWORKS: - image_proxy_params = default_params.copy() - image_proxy_params['enable_http2'] = False - NETWORKS['image_proxy'] = new_network(image_proxy_params, logger_name='image_proxy') - - -@atexit.register -def done(): - """Close all HTTP client - - Avoid a warning at exit - see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785 - - Note: since Network.aclose has to be async, it is not possible to call this method on Network.__del__ - So Network.aclose is called here using atexit.register - """ - try: - loop = get_loop() - if loop: - future = asyncio.run_coroutine_threadsafe(Network.aclose_all(), loop) - # wait 3 seconds to close the HTTP clients - future.result(3) - finally: - NETWORKS.clear() - - -NETWORKS[DEFAULT_NAME] = Network() diff --git a/apps/searxng/searx/network/raise_for_httperror.py b/apps/searxng/searx/network/raise_for_httperror.py deleted file mode 100755 index 9f847d4..0000000 --- a/apps/searxng/searx/network/raise_for_httperror.py +++ /dev/null @@ -1,78 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Raise exception for an HTTP response is an error. - -""" - -from searx.exceptions import ( - SearxEngineCaptchaException, - SearxEngineTooManyRequestsException, - SearxEngineAccessDeniedException, -) -from searx import get_setting - - -def is_cloudflare_challenge(resp): - if resp.status_code in [429, 503]: - if ('__cf_chl_jschl_tk__=' in resp.text) or ( - '/cdn-cgi/challenge-platform/' in resp.text - and 'orchestrate/jsch/v1' in resp.text - and 'window._cf_chl_enter(' in resp.text - ): - return True - if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text: - return True - return False - - -def is_cloudflare_firewall(resp): - return resp.status_code == 403 and '1020' in resp.text - - -def raise_for_cloudflare_captcha(resp): - if resp.headers.get('Server', '').startswith('cloudflare'): - if is_cloudflare_challenge(resp): - # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha- - # suspend for 2 weeks - raise SearxEngineCaptchaException( - message='Cloudflare CAPTCHA', suspended_time=get_setting('search.suspended_times.cf_SearxEngineCaptcha') - ) - - if is_cloudflare_firewall(resp): - raise SearxEngineAccessDeniedException( - message='Cloudflare Firewall', - suspended_time=get_setting('search.suspended_times.cf_SearxEngineAccessDenied'), - ) - - -def raise_for_recaptcha(resp): - if resp.status_code == 503 and '"https://www.google.com/recaptcha/' in resp.text: - raise SearxEngineCaptchaException( - message='ReCAPTCHA', suspended_time=get_setting('search.suspended_times.recaptcha_SearxEngineCaptcha') - ) - - -def raise_for_captcha(resp): - raise_for_cloudflare_captcha(resp) - raise_for_recaptcha(resp) - - -def raise_for_httperror(resp): - """Raise exception for an HTTP response is an error. - - Args: - resp (requests.Response): Response to check - - Raises: - requests.HTTPError: raise by resp.raise_for_status() - searx.exceptions.SearxEngineAccessDeniedException: raise when the HTTP status code is 402 or 403. - searx.exceptions.SearxEngineTooManyRequestsException: raise when the HTTP status code is 429. - searx.exceptions.SearxEngineCaptchaException: raise when if CATPCHA challenge is detected. 
- """ - if resp.status_code and resp.status_code >= 400: - raise_for_captcha(resp) - if resp.status_code in (402, 403): - raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code)) - if resp.status_code == 429: - raise SearxEngineTooManyRequestsException() - resp.raise_for_status() diff --git a/apps/searxng/searx/plugins/__init__.py b/apps/searxng/searx/plugins/__init__.py deleted file mode 100755 index 8ece943..0000000 --- a/apps/searxng/searx/plugins/__init__.py +++ /dev/null @@ -1,234 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring, missing-class-docstring - -import sys -from hashlib import sha256 -from importlib import import_module -from os import listdir, makedirs, remove, stat, utime -from os.path import abspath, basename, dirname, exists, join -from shutil import copyfile -from pkgutil import iter_modules -from logging import getLogger -from typing import List, Tuple - -from searx import logger, settings - - -class Plugin: # pylint: disable=too-few-public-methods - """This class is currently never initialized and only used for type hinting.""" - - id: str - name: str - description: str - default_on: bool - js_dependencies: Tuple[str] - css_dependencies: Tuple[str] - preference_section: str - - -logger = logger.getChild("plugins") - -required_attrs = ( - # fmt: off - ("name", str), - ("description", str), - ("default_on", bool) - # fmt: on -) - -optional_attrs = ( - # fmt: off - ("js_dependencies", tuple), - ("css_dependencies", tuple), - ("preference_section", str), - # fmt: on -) - - -def sha_sum(filename): - with open(filename, "rb") as f: - file_content_bytes = f.read() - return sha256(file_content_bytes).hexdigest() - - -def sync_resource(base_path, resource_path, name, target_dir, plugin_dir): - dep_path = join(base_path, resource_path) - file_name = basename(dep_path) - resource_path = join(target_dir, file_name) - if not exists(resource_path) or sha_sum(dep_path) != sha_sum(resource_path): - try: - copyfile(dep_path, resource_path) - # copy atime_ns and mtime_ns, so the weak ETags (generated by - # the HTTP server) do not change - dep_stat = stat(dep_path) - utime(resource_path, ns=(dep_stat.st_atime_ns, dep_stat.st_mtime_ns)) - except IOError: - logger.critical("failed to copy plugin resource {0} for plugin {1}".format(file_name, name)) - sys.exit(3) - - # returning with the web path of the resource - return join("plugins/external_plugins", plugin_dir, file_name) - - -def prepare_package_resources(plugin, plugin_module_name): - plugin_base_path = dirname(abspath(plugin.__file__)) - - plugin_dir = plugin_module_name - target_dir = join(settings["ui"]["static_path"], "plugins/external_plugins", plugin_dir) - try: - makedirs(target_dir, exist_ok=True) - except IOError: - logger.critical("failed to create resource directory {0} for plugin {1}".format(target_dir, plugin_module_name)) - sys.exit(3) - - resources = [] - - if hasattr(plugin, "js_dependencies"): - resources.extend(map(basename, plugin.js_dependencies)) - plugin.js_dependencies = [ - sync_resource(plugin_base_path, x, plugin_module_name, target_dir, plugin_dir) - for x in plugin.js_dependencies - ] - - if hasattr(plugin, "css_dependencies"): - resources.extend(map(basename, plugin.css_dependencies)) - plugin.css_dependencies = [ - sync_resource(plugin_base_path, x, plugin_module_name, target_dir, plugin_dir) - for x in plugin.css_dependencies - ] - - for f in listdir(target_dir): - if basename(f) not in resources: - 
resource_path = join(target_dir, basename(f)) - try: - remove(resource_path) - except IOError: - logger.critical( - "failed to remove unused resource file {0} for plugin {1}".format(resource_path, plugin_module_name) - ) - sys.exit(3) - - -def load_plugin(plugin_module_name, external): - # pylint: disable=too-many-branches - try: - plugin = import_module(plugin_module_name) - except ( - SyntaxError, - KeyboardInterrupt, - SystemExit, - SystemError, - ImportError, - RuntimeError, - ) as e: - logger.critical("%s: fatal exception", plugin_module_name, exc_info=e) - sys.exit(3) - except BaseException: - logger.exception("%s: exception while loading, the plugin is disabled", plugin_module_name) - return None - - # difference with searx: use module name instead of the user name - plugin.id = plugin_module_name - - # - plugin.logger = getLogger(plugin_module_name) - - for plugin_attr, plugin_attr_type in required_attrs: - if not hasattr(plugin, plugin_attr): - logger.critical('%s: missing attribute "%s", cannot load plugin', plugin, plugin_attr) - sys.exit(3) - attr = getattr(plugin, plugin_attr) - if not isinstance(attr, plugin_attr_type): - type_attr = str(type(attr)) - logger.critical( - '{1}: attribute "{0}" is of type {2}, must be of type {3}, cannot load plugin'.format( - plugin, plugin_attr, type_attr, plugin_attr_type - ) - ) - sys.exit(3) - - for plugin_attr, plugin_attr_type in optional_attrs: - if not hasattr(plugin, plugin_attr) or not isinstance(getattr(plugin, plugin_attr), plugin_attr_type): - setattr(plugin, plugin_attr, plugin_attr_type()) - - if not hasattr(plugin, "preference_section"): - plugin.preference_section = "general" - - # query plugin - if plugin.preference_section == "query": - for plugin_attr in ("query_keywords", "query_examples"): - if not hasattr(plugin, plugin_attr): - logger.critical('missing attribute "{0}", cannot load plugin: {1}'.format(plugin_attr, plugin)) - sys.exit(3) - - if settings.get("enabled_plugins"): - # searx compatibility: plugin.name in settings['enabled_plugins'] - plugin.default_on = plugin.name in settings["enabled_plugins"] or plugin.id in settings["enabled_plugins"] - - # copy ressources if this is an external plugin - if external: - prepare_package_resources(plugin, plugin_module_name) - - logger.debug("%s: loaded", plugin_module_name) - - return plugin - - -def load_and_initialize_plugin(plugin_module_name, external, init_args): - plugin = load_plugin(plugin_module_name, external) - if plugin and hasattr(plugin, 'init'): - try: - return plugin if plugin.init(*init_args) else None - except Exception: # pylint: disable=broad-except - plugin.logger.exception("Exception while calling init, the plugin is disabled") - return None - return plugin - - -class PluginStore: - def __init__(self): - self.plugins: List[Plugin] = [] - - def __iter__(self): - for plugin in self.plugins: - yield plugin - - def register(self, plugin): - self.plugins.append(plugin) - - def call(self, ordered_plugin_list, plugin_type, *args, **kwargs): - ret = True - for plugin in ordered_plugin_list: - if hasattr(plugin, plugin_type): - try: - ret = getattr(plugin, plugin_type)(*args, **kwargs) - if not ret: - break - except Exception: # pylint: disable=broad-except - plugin.logger.exception("Exception while calling %s", plugin_type) - return ret - - -plugins = PluginStore() - - -def plugin_module_names(): - yield_plugins = set() - - # embedded plugins - for module in iter_modules(path=[dirname(__file__)]): - yield (__name__ + "." 
+ module.name, False) - yield_plugins.add(module.name) - # external plugins - for module_name in settings['plugins']: - if module_name not in yield_plugins: - yield (module_name, True) - yield_plugins.add(module_name) - - -def initialize(app): - for module_name, external in plugin_module_names(): - plugin = load_and_initialize_plugin(module_name, external, (app, settings)) - if plugin: - plugins.register(plugin) diff --git a/apps/searxng/searx/plugins/ahmia_filter.py b/apps/searxng/searx/plugins/ahmia_filter.py deleted file mode 100755 index 326da9c..0000000 --- a/apps/searxng/searx/plugins/ahmia_filter.py +++ /dev/null @@ -1,29 +0,0 @@ -''' - SPDX-License-Identifier: AGPL-3.0-or-later -''' - -from hashlib import md5 -from searx.data import ahmia_blacklist_loader - -name = "Ahmia blacklist" -description = "Filter out onion results that appear in Ahmia's blacklist. (See https://ahmia.fi/blacklist)" -default_on = True -preference_section = 'onions' - -ahmia_blacklist = None - - -def on_result(request, search, result): - if not result.get('is_onion') or not result.get('parsed_url'): - return True - result_hash = md5(result['parsed_url'].hostname.encode()).hexdigest() - return result_hash not in ahmia_blacklist - - -def init(app, settings): - global ahmia_blacklist # pylint: disable=global-statement - if not settings['outgoing']['using_tor_proxy']: - # disable the plugin - return False - ahmia_blacklist = ahmia_blacklist_loader() - return True diff --git a/apps/searxng/searx/plugins/hash_plugin.py b/apps/searxng/searx/plugins/hash_plugin.py deleted file mode 100755 index edb91dd..0000000 --- a/apps/searxng/searx/plugins/hash_plugin.py +++ /dev/null @@ -1,57 +0,0 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. 
- -(C) 2015 by Adam Tauber, -(C) 2018, 2020 by Vaclav Zouzalik -''' - -from flask_babel import gettext -import hashlib -import re - -name = "Hash plugin" -description = gettext("Converts strings to different hash digests.") -default_on = True -preference_section = 'query' -query_keywords = ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'] -query_examples = 'sha512 The quick brown fox jumps over the lazy dog' - -parser_re = re.compile('(md5|sha1|sha224|sha256|sha384|sha512) (.*)', re.I) - - -def post_search(request, search): - # process only on first page - if search.search_query.pageno > 1: - return True - m = parser_re.match(search.search_query.query) - if not m: - # wrong query - return True - - function, string = m.groups() - if string.strip().__len__() == 0: - # end if the string is empty - return True - - # select hash function - f = hashlib.new(function.lower()) - - # make digest from the given string - f.update(string.encode('utf-8').strip()) - answer = function + " " + gettext('hash digest') + ": " + f.hexdigest() - - # print result - search.result_container.answers.clear() - search.result_container.answers['hash'] = {'answer': answer} - return True diff --git a/apps/searxng/searx/plugins/hostname_replace.py b/apps/searxng/searx/plugins/hostname_replace.py deleted file mode 100755 index 039aadb..0000000 --- a/apps/searxng/searx/plugins/hostname_replace.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import re -from urllib.parse import urlunparse, urlparse -from searx import settings -from searx.plugins import logger -from flask_babel import gettext - -name = gettext('Hostname replace') -description = gettext('Rewrite result hostnames or remove results based on the hostname') -default_on = False -preference_section = 'general' - -plugin_id = 'hostname_replace' - -replacements = {re.compile(p): r for (p, r) in settings[plugin_id].items()} if plugin_id in settings else {} - -logger = logger.getChild(plugin_id) -parsed = 'parsed_url' -_url_fields = ['iframe_src', 'audio_src'] - - -def on_result(request, search, result): - - for (pattern, replacement) in replacements.items(): - - if parsed in result: - if pattern.search(result[parsed].netloc): - # to keep or remove this result from the result list depends - # (only) on the 'parsed_url' - if not replacement: - return False - result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc)) - result['url'] = urlunparse(result[parsed]) - - for url_field in _url_fields: - if result.get(url_field): - url_src = urlparse(result[url_field]) - if pattern.search(url_src.netloc): - if not replacement: - del result[url_field] - else: - url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc)) - result[url_field] = urlunparse(url_src) - - return True diff --git a/apps/searxng/searx/plugins/limiter.py b/apps/searxng/searx/plugins/limiter.py deleted file mode 100755 index a8beb5e..0000000 --- a/apps/searxng/searx/plugins/limiter.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pyright: basic -"""see :ref:`limiter src`""" - -import flask - -from searx import redisdb -from searx.plugins import logger -from searx.botdetection import limiter - -name = "Request limiter" -description = "Limit the number of request" -default_on = False -preference_section = 'service' - -logger = logger.getChild('limiter') - - -def pre_request(): - """See :ref:`flask.Flask.before_request`""" - return limiter.filter_request(flask.request) 
- - -def init(app: flask.Flask, settings) -> bool: - if not settings['server']['limiter']: - return False - if not redisdb.client(): - logger.error("The limiter requires Redis") - return False - app.before_request(pre_request) - return True diff --git a/apps/searxng/searx/plugins/oa_doi_rewrite.py b/apps/searxng/searx/plugins/oa_doi_rewrite.py deleted file mode 100755 index f0e0773..0000000 --- a/apps/searxng/searx/plugins/oa_doi_rewrite.py +++ /dev/null @@ -1,47 +0,0 @@ -from urllib.parse import urlparse, parse_qsl -from flask_babel import gettext -import re -from searx import settings - - -regex = re.compile(r'10\.\d{4,9}/[^\s]+') - -name = gettext('Open Access DOI rewrite') -description = gettext('Avoid paywalls by redirecting to open-access versions of publications when available') -default_on = False -preference_section = 'general' - - -def extract_doi(url): - match = regex.search(url.path) - if match: - return match.group(0) - for _, v in parse_qsl(url.query): - match = regex.search(v) - if match: - return match.group(0) - return None - - -def get_doi_resolver(preferences): - doi_resolvers = settings['doi_resolvers'] - selected_resolver = preferences.get_value('doi_resolver')[0] - if selected_resolver not in doi_resolvers: - selected_resolver = settings['default_doi_resolver'] - return doi_resolvers[selected_resolver] - - -def on_result(request, search, result): - if 'parsed_url' not in result: - return True - - doi = extract_doi(result['parsed_url']) - if doi and len(doi) < 50: - for suffix in ('/', '.pdf', '.xml', '/full', '/meta', '/abstract'): - if doi.endswith(suffix): - doi = doi[: -len(suffix)] - result['url'] = get_doi_resolver(request.preferences) + doi - result['parsed_url'] = urlparse(result['url']) - if 'doi' not in result: - result['doi'] = doi - return True diff --git a/apps/searxng/searx/plugins/search_on_category_select.py b/apps/searxng/searx/plugins/search_on_category_select.py deleted file mode 100755 index 85b73a9..0000000 --- a/apps/searxng/searx/plugins/search_on_category_select.py +++ /dev/null @@ -1,24 +0,0 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. - -(C) 2015 by Adam Tauber, -''' -from flask_babel import gettext - -name = gettext('Search on category select') -description = gettext( - 'Perform search immediately if a category selected. Disable to select multiple categories. 
(JavaScript required)' -) -default_on = True -preference_section = 'ui' diff --git a/apps/searxng/searx/plugins/self_info.py b/apps/searxng/searx/plugins/self_info.py deleted file mode 100755 index 8079ee0..0000000 --- a/apps/searxng/searx/plugins/self_info.py +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring,invalid-name - -import re -from flask_babel import gettext - -from searx.botdetection._helpers import get_real_ip - -name = gettext('Self Information') -description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".') -default_on = True -preference_section = 'query' -query_keywords = ['user-agent'] -query_examples = '' - -# Self User Agent regex -p = re.compile('.*user[ -]agent.*', re.IGNORECASE) - - -def post_search(request, search): - if search.search_query.pageno > 1: - return True - if search.search_query.query == 'ip': - ip = get_real_ip(request) - search.result_container.answers['ip'] = {'answer': ip} - elif p.match(search.search_query.query): - ua = request.user_agent - search.result_container.answers['user-agent'] = {'answer': ua} - return True diff --git a/apps/searxng/searx/plugins/tor_check.py b/apps/searxng/searx/plugins/tor_check.py deleted file mode 100755 index 831c90c..0000000 --- a/apps/searxng/searx/plugins/tor_check.py +++ /dev/null @@ -1,92 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""A plugin to check if the ip address of the request is a Tor exit-node if the -user searches for ``tor-check``. It fetches the tor exit node list from -https://check.torproject.org/exit-addresses and parses all the IPs into a list, -then checks if the user's IP address is in it. - -Enable in ``settings.yml``: - -.. code:: yaml - - enabled_plugins: - .. - - 'Tor check plugin' - -""" - -import re -from flask_babel import gettext -from httpx import HTTPError -from searx.network import get - -default_on = False - -name = gettext("Tor check plugin") -'''Translated name of the plugin''' - -description = gettext( - "This plugin checks if the address of the request is a Tor exit-node, and" - " informs the user if it is; like check.torproject.org, but from SearXNG." -) -'''Translated description of the plugin.''' - -preference_section = 'query' -'''The preference section where the plugin is shown.''' - -query_keywords = ['tor-check'] -'''Query keywords shown in the preferences.''' - -query_examples = '' -'''Query examples shown in the preferences.''' - -# Regex for exit node addresses in the list. -reg = re.compile(r"(?<=ExitAddress )\S+") - - -def post_search(request, search): - - if search.search_query.pageno > 1: - return True - - if search.search_query.query.lower() == "tor-check": - - # Request the list of tor exit nodes. 
- try: - resp = get("https://check.torproject.org/exit-addresses") - node_list = re.findall(reg, resp.text) - - except HTTPError: - # No answer, return error - search.result_container.answers["tor"] = { - "answer": gettext( - "Could not download the list of Tor exit-nodes from: https://check.torproject.org/exit-addresses" - ) - } - return True - - x_forwarded_for = request.headers.getlist("X-Forwarded-For") - - if x_forwarded_for: - ip_address = x_forwarded_for[0] - else: - ip_address = request.remote_addr - - if ip_address in node_list: - search.result_container.answers["tor"] = { - "answer": gettext( - "You are using Tor and it looks like you have this external IP address: {ip_address}".format( - ip_address=ip_address - ) - ) - } - else: - search.result_container.answers["tor"] = { - "answer": gettext( - "You are not using Tor and you have this external IP address: {ip_address}".format( - ip_address=ip_address - ) - ) - } - - return True diff --git a/apps/searxng/searx/plugins/tracker_url_remover.py b/apps/searxng/searx/plugins/tracker_url_remover.py deleted file mode 100755 index 42c58e5..0000000 --- a/apps/searxng/searx/plugins/tracker_url_remover.py +++ /dev/null @@ -1,55 +0,0 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. - -(C) 2015 by Adam Tauber, -''' - -from flask_babel import gettext -import re -from urllib.parse import urlunparse, parse_qsl, urlencode - -regexes = { - re.compile(r'utm_[^&]+'), - re.compile(r'(wkey|wemail)[^&]*'), - re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'), - re.compile(r'&$'), -} - -name = gettext('Tracker URL remover') -description = gettext('Remove trackers arguments from the returned URL') -default_on = True -preference_section = 'privacy' - - -def on_result(request, search, result): - if 'parsed_url' not in result: - return True - - query = result['parsed_url'].query - - if query == "": - return True - parsed_query = parse_qsl(query) - - changes = 0 - for i, (param_name, _) in enumerate(list(parsed_query)): - for reg in regexes: - if reg.match(param_name): - parsed_query.pop(i - changes) - changes += 1 - result['parsed_url'] = result['parsed_url']._replace(query=urlencode(parsed_query)) - result['url'] = urlunparse(result['parsed_url']) - break - - return True diff --git a/apps/searxng/searx/plugins/vim_hotkeys.py b/apps/searxng/searx/plugins/vim_hotkeys.py deleted file mode 100755 index 3eeaf8c..0000000 --- a/apps/searxng/searx/plugins/vim_hotkeys.py +++ /dev/null @@ -1,10 +0,0 @@ -from flask_babel import gettext - -name = gettext('Vim-like hotkeys') -description = gettext( - 'Navigate search results with Vim-like hotkeys ' - '(JavaScript required). ' - 'Press "h" key on main or result page to get help.' 
-) -default_on = False -preference_section = 'ui' diff --git a/apps/searxng/searx/preferences.py b/apps/searxng/searx/preferences.py deleted file mode 100755 index aba7126..0000000 --- a/apps/searxng/searx/preferences.py +++ /dev/null @@ -1,591 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Searx preferences implementation. -""" - -# pylint: disable=useless-object-inheritance - -from base64 import urlsafe_b64encode, urlsafe_b64decode -from zlib import compress, decompress -from urllib.parse import parse_qs, urlencode -from typing import Iterable, Dict, List, Optional - -import flask -import babel - -from searx import settings, autocomplete -from searx.enginelib import Engine -from searx.plugins import Plugin -from searx.locales import LOCALE_NAMES -from searx.webutils import VALID_LANGUAGE_CODE -from searx.engines import DEFAULT_CATEGORY - - -COOKIE_MAX_AGE = 60 * 60 * 24 * 365 * 5 # 5 years -DOI_RESOLVERS = list(settings['doi_resolvers']) - - -class ValidationException(Exception): - - """Exption from ``cls.__init__`` when configuration value is invalid.""" - - -class Setting: - """Base class of user settings""" - - def __init__(self, default_value, locked: bool = False): - super().__init__() - self.value = default_value - self.locked = locked - - def parse(self, data: str): - """Parse ``data`` and store the result at ``self.value`` - - If needed, its overwritten in the inheritance. - """ - self.value = data - - def get_value(self): - """Returns the value of the setting - - If needed, its overwritten in the inheritance. - """ - return self.value - - def save(self, name: str, resp: flask.Response): - """Save cookie ``name`` in the HTTP response object - - If needed, its overwritten in the inheritance.""" - resp.set_cookie(name, self.value, max_age=COOKIE_MAX_AGE) - - -class StringSetting(Setting): - """Setting of plain string values""" - - -class EnumStringSetting(Setting): - """Setting of a value which can only come from the given choices""" - - def __init__(self, default_value: str, choices: Iterable[str], locked=False): - super().__init__(default_value, locked) - self.choices = choices - self._validate_selection(self.value) - - def _validate_selection(self, selection: str): - if selection not in self.choices: - raise ValidationException('Invalid value: "{0}"'.format(selection)) - - def parse(self, data: str): - """Parse and validate ``data`` and store the result at ``self.value``""" - self._validate_selection(data) - self.value = data - - -class MultipleChoiceSetting(Setting): - """Setting of values which can only come from the given choices""" - - def __init__(self, default_value: List[str], choices: Iterable[str], locked=False): - super().__init__(default_value, locked) - self.choices = choices - self._validate_selections(self.value) - - def _validate_selections(self, selections: List[str]): - for item in selections: - if item not in self.choices: - raise ValidationException('Invalid value: "{0}"'.format(selections)) - - def parse(self, data: str): - """Parse and validate ``data`` and store the result at ``self.value``""" - if data == '': - self.value = [] - return - - elements = data.split(',') - self._validate_selections(elements) - self.value = elements - - def parse_form(self, data: List[str]): - if self.locked: - return - - self.value = [] - for choice in data: - if choice in self.choices and choice not in self.value: - self.value.append(choice) - - def save(self, name: str, resp: flask.Response): - """Save cookie ``name`` in the HTTP response object""" 
- resp.set_cookie(name, ','.join(self.value), max_age=COOKIE_MAX_AGE) - - -class SetSetting(Setting): - """Setting of values of type ``set`` (comma separated string)""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.values = set() - - def get_value(self): - """Returns a string with comma separated values.""" - return ','.join(self.values) - - def parse(self, data: str): - """Parse and validate ``data`` and store the result at ``self.value``""" - if data == '': - self.values = set() - return - - elements = data.split(',') - for element in elements: - self.values.add(element) - - def parse_form(self, data: str): - if self.locked: - return - - elements = data.split(',') - self.values = set(elements) - - def save(self, name: str, resp: flask.Response): - """Save cookie ``name`` in the HTTP response object""" - resp.set_cookie(name, ','.join(self.values), max_age=COOKIE_MAX_AGE) - - -class SearchLanguageSetting(EnumStringSetting): - """Available choices may change, so user's value may not be in choices anymore""" - - def _validate_selection(self, selection): - if selection != '' and selection != 'auto' and not VALID_LANGUAGE_CODE.match(selection): - raise ValidationException('Invalid language code: "{0}"'.format(selection)) - - def parse(self, data: str): - """Parse and validate ``data`` and store the result at ``self.value``""" - if data not in self.choices and data != self.value: - # hack to give some backwards compatibility with old language cookies - data = str(data).replace('_', '-') - lang = data.split('-', maxsplit=1)[0] - - if data in self.choices: - pass - elif lang in self.choices: - data = lang - else: - data = self.value - self._validate_selection(data) - self.value = data - - -class MapSetting(Setting): - """Setting of a value that has to be translated in order to be storable""" - - def __init__(self, default_value, map: Dict[str, object], locked=False): # pylint: disable=redefined-builtin - super().__init__(default_value, locked) - self.map = map - - if self.value not in self.map.values(): - raise ValidationException('Invalid default value') - - def parse(self, data: str): - """Parse and validate ``data`` and store the result at ``self.value``""" - - if data not in self.map: - raise ValidationException('Invalid choice: {0}'.format(data)) - self.value = self.map[data] - self.key = data # pylint: disable=attribute-defined-outside-init - - def save(self, name: str, resp: flask.Response): - """Save cookie ``name`` in the HTTP response object""" - if hasattr(self, 'key'): - resp.set_cookie(name, self.key, max_age=COOKIE_MAX_AGE) - - -class BooleanChoices: - """Maps strings to booleans that are either true or false.""" - - def __init__(self, name: str, choices: Dict[str, bool], locked: bool = False): - self.name = name - self.choices = choices - self.locked = locked - self.default_choices = dict(choices) - - def transform_form_items(self, items): - return items - - def transform_values(self, values): - return values - - def parse_cookie(self, data_disabled: str, data_enabled: str): - for disabled in data_disabled.split(','): - if disabled in self.choices: - self.choices[disabled] = False - - for enabled in data_enabled.split(','): - if enabled in self.choices: - self.choices[enabled] = True - - def parse_form(self, items: List[str]): - if self.locked: - return - - disabled = self.transform_form_items(items) - for setting in self.choices: - self.choices[setting] = setting not in disabled - - @property - def enabled(self): - return (k for k, v in 
self.choices.items() if v) - - @property - def disabled(self): - return (k for k, v in self.choices.items() if not v) - - def save(self, resp: flask.Response): - """Save cookie in the HTTP response object""" - disabled_changed = (k for k in self.disabled if self.default_choices[k]) - enabled_changed = (k for k in self.enabled if not self.default_choices[k]) - resp.set_cookie('disabled_{0}'.format(self.name), ','.join(disabled_changed), max_age=COOKIE_MAX_AGE) - resp.set_cookie('enabled_{0}'.format(self.name), ','.join(enabled_changed), max_age=COOKIE_MAX_AGE) - - def get_disabled(self): - return self.transform_values(list(self.disabled)) - - def get_enabled(self): - return self.transform_values(list(self.enabled)) - - -class EnginesSetting(BooleanChoices): - """Engine settings""" - - def __init__(self, default_value, engines: Iterable[Engine]): - choices = {} - for engine in engines: - for category in engine.categories: - if not category in list(settings['categories_as_tabs'].keys()) + [DEFAULT_CATEGORY]: - continue - choices['{}__{}'.format(engine.name, category)] = not engine.disabled - super().__init__(default_value, choices) - - def transform_form_items(self, items): - return [item[len('engine_') :].replace('_', ' ').replace(' ', '__') for item in items] - - def transform_values(self, values): - if len(values) == 1 and next(iter(values)) == '': - return [] - transformed_values = [] - for value in values: - engine, category = value.split('__') - transformed_values.append((engine, category)) - return transformed_values - - -class PluginsSetting(BooleanChoices): - """Plugin settings""" - - def __init__(self, default_value, plugins: Iterable[Plugin]): - super().__init__(default_value, {plugin.id: plugin.default_on for plugin in plugins}) - - def transform_form_items(self, items): - return [item[len('plugin_') :] for item in items] - - -class ClientPref: - """Container to assemble client prefferences and settings.""" - - # hint: searx.webapp.get_client_settings should be moved into this class - - locale: babel.Locale - """Locale prefered by the client.""" - - def __init__(self, locale: Optional[babel.Locale] = None): - self.locale = locale - - @property - def locale_tag(self): - if self.locale is None: - return None - tag = self.locale.language - if self.locale.territory: - tag += '-' + self.locale.territory - return tag - - @classmethod - def from_http_request(cls, http_request: flask.Request): - """Build ClientPref object from HTTP request. 
- - - `Accept-Language used for locale setting - `__ - - """ - al_header = http_request.headers.get("Accept-Language") - if not al_header: - return cls(locale=None) - - pairs = [] - for l in al_header.split(','): - # fmt: off - lang, qvalue = [_.strip() for _ in (l.split(';') + ['q=1',])[:2]] - # fmt: on - try: - qvalue = float(qvalue.split('=')[-1]) - locale = babel.Locale.parse(lang, sep='-') - except (ValueError, babel.core.UnknownLocaleError): - continue - pairs.append((locale, qvalue)) - - locale = None - if pairs: - pairs.sort(reverse=True, key=lambda x: x[1]) - locale = pairs[0][0] - return cls(locale=locale) - - -class Preferences: - """Validates and saves preferences to cookies""" - - def __init__( - self, - themes: List[str], - categories: List[str], - engines: Dict[str, Engine], - plugins: Iterable[Plugin], - client: Optional[ClientPref] = None, - ): - - super().__init__() - - self.key_value_settings: Dict[str, Setting] = { - # fmt: off - 'categories': MultipleChoiceSetting( - ['general'], - locked=is_locked('categories'), - choices=categories + ['none'] - ), - 'language': SearchLanguageSetting( - settings['search']['default_lang'], - locked=is_locked('language'), - choices=settings['search']['languages'] + [''] - ), - 'locale': EnumStringSetting( - settings['ui']['default_locale'], - locked=is_locked('locale'), - choices=list(LOCALE_NAMES.keys()) + [''] - ), - 'autocomplete': EnumStringSetting( - settings['search']['autocomplete'], - locked=is_locked('autocomplete'), - choices=list(autocomplete.backends.keys()) + [''] - ), - 'image_proxy': MapSetting( - settings['server']['image_proxy'], - locked=is_locked('image_proxy'), - map={ - '': settings['server']['image_proxy'], - '0': False, - '1': True, - 'True': True, - 'False': False - } - ), - 'method': EnumStringSetting( - settings['server']['method'], - locked=is_locked('method'), - choices=('GET', 'POST') - ), - 'safesearch': MapSetting( - settings['search']['safe_search'], - locked=is_locked('safesearch'), - map={ - '0': 0, - '1': 1, - '2': 2 - } - ), - 'theme': EnumStringSetting( - settings['ui']['default_theme'], - locked=is_locked('theme'), - choices=themes - ), - 'results_on_new_tab': MapSetting( - settings['ui']['results_on_new_tab'], - locked=is_locked('results_on_new_tab'), - map={ - '0': False, - '1': True, - 'False': False, - 'True': True - } - ), - 'doi_resolver': MultipleChoiceSetting( - [settings['default_doi_resolver'], ], - locked=is_locked('doi_resolver'), - choices=DOI_RESOLVERS - ), - 'simple_style': EnumStringSetting( - settings['ui']['theme_args']['simple_style'], - locked=is_locked('simple_style'), - choices=['', 'auto', 'light', 'dark'] - ), - 'center_alignment': MapSetting( - settings['ui']['center_alignment'], - locked=is_locked('center_alignment'), - map={ - '0': False, - '1': True, - 'False': False, - 'True': True - } - ), - 'advanced_search': MapSetting( - settings['ui']['advanced_search'], - locked=is_locked('advanced_search'), - map={ - '0': False, - '1': True, - 'False': False, - 'True': True, - 'on': True, - } - ), - 'query_in_title': MapSetting( - settings['ui']['query_in_title'], - locked=is_locked('query_in_title'), - map={ - '': settings['ui']['query_in_title'], - '0': False, - '1': True, - 'True': True, - 'False': False - } - ), - 'infinite_scroll': MapSetting( - settings['ui']['infinite_scroll'], - locked=is_locked('infinite_scroll'), - map={ - '': settings['ui']['infinite_scroll'], - '0': False, - '1': True, - 'True': True, - 'False': False - } - ), - # fmt: on - } - - self.engines = 
EnginesSetting('engines', engines=engines.values()) - self.plugins = PluginsSetting('plugins', plugins=plugins) - self.tokens = SetSetting('tokens') - self.client = client or ClientPref() - self.unknown_params: Dict[str, str] = {} - - def get_as_url_params(self): - """Return preferences as URL parameters""" - settings_kv = {} - for k, v in self.key_value_settings.items(): - if v.locked: - continue - if isinstance(v, MultipleChoiceSetting): - settings_kv[k] = ','.join(v.get_value()) - else: - settings_kv[k] = v.get_value() - - settings_kv['disabled_engines'] = ','.join(self.engines.disabled) - settings_kv['enabled_engines'] = ','.join(self.engines.enabled) - - settings_kv['disabled_plugins'] = ','.join(self.plugins.disabled) - settings_kv['enabled_plugins'] = ','.join(self.plugins.enabled) - - settings_kv['tokens'] = ','.join(self.tokens.values) - - return urlsafe_b64encode(compress(urlencode(settings_kv).encode())).decode() - - def parse_encoded_data(self, input_data: str): - """parse (base64) preferences from request (``flask.request.form['preferences']``)""" - bin_data = decompress(urlsafe_b64decode(input_data)) - dict_data = {} - for x, y in parse_qs(bin_data.decode('ascii'), keep_blank_values=True).items(): - dict_data[x] = y[0] - self.parse_dict(dict_data) - - def parse_dict(self, input_data: Dict[str, str]): - """parse preferences from request (``flask.request.form``)""" - for user_setting_name, user_setting in input_data.items(): - if user_setting_name in self.key_value_settings: - if self.key_value_settings[user_setting_name].locked: - continue - self.key_value_settings[user_setting_name].parse(user_setting) - elif user_setting_name == 'disabled_engines': - self.engines.parse_cookie(input_data.get('disabled_engines', ''), input_data.get('enabled_engines', '')) - elif user_setting_name == 'disabled_plugins': - self.plugins.parse_cookie(input_data.get('disabled_plugins', ''), input_data.get('enabled_plugins', '')) - elif user_setting_name == 'tokens': - self.tokens.parse(user_setting) - elif not any( - user_setting_name.startswith(x) for x in ['enabled_', 'disabled_', 'engine_', 'category_', 'plugin_'] - ): - self.unknown_params[user_setting_name] = user_setting - - def parse_form(self, input_data: Dict[str, str]): - """Parse formular (````) data from a ``flask.request.form``""" - disabled_engines = [] - enabled_categories = [] - disabled_plugins = [] - for user_setting_name, user_setting in input_data.items(): - if user_setting_name in self.key_value_settings: - self.key_value_settings[user_setting_name].parse(user_setting) - elif user_setting_name.startswith('engine_'): - disabled_engines.append(user_setting_name) - elif user_setting_name.startswith('category_'): - enabled_categories.append(user_setting_name[len('category_') :]) - elif user_setting_name.startswith('plugin_'): - disabled_plugins.append(user_setting_name) - elif user_setting_name == 'tokens': - self.tokens.parse_form(user_setting) - else: - self.unknown_params[user_setting_name] = user_setting - self.key_value_settings['categories'].parse_form(enabled_categories) - self.engines.parse_form(disabled_engines) - self.plugins.parse_form(disabled_plugins) - - # cannot be used in case of engines or plugins - def get_value(self, user_setting_name: str): - """Returns the value for ``user_setting_name``""" - ret_val = None - if user_setting_name in self.key_value_settings: - ret_val = self.key_value_settings[user_setting_name].get_value() - if user_setting_name in self.unknown_params: - ret_val = 
self.unknown_params[user_setting_name] - return ret_val - - def save(self, resp: flask.Response): - """Save cookie in the HTTP response object""" - for user_setting_name, user_setting in self.key_value_settings.items(): - # pylint: disable=unnecessary-dict-index-lookup - if self.key_value_settings[user_setting_name].locked: - continue - user_setting.save(user_setting_name, resp) - self.engines.save(resp) - self.plugins.save(resp) - self.tokens.save('tokens', resp) - for k, v in self.unknown_params.items(): - resp.set_cookie(k, v, max_age=COOKIE_MAX_AGE) - return resp - - def validate_token(self, engine): - valid = True - if hasattr(engine, 'tokens') and engine.tokens: - valid = False - for token in self.tokens.values: - if token in engine.tokens: - valid = True - break - - return valid - - -def is_locked(setting_name: str): - """Checks if a given setting name is locked by settings.yml""" - if 'preferences' not in settings: - return False - if 'lock' not in settings['preferences']: - return False - return setting_name in settings['preferences']['lock'] diff --git a/apps/searxng/searx/query.py b/apps/searxng/searx/query.py deleted file mode 100755 index 751308b..0000000 --- a/apps/searxng/searx/query.py +++ /dev/null @@ -1,334 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -from abc import abstractmethod, ABC -import re - -from searx import settings -from searx.sxng_locales import sxng_locales -from searx.engines import categories, engines, engine_shortcuts -from searx.external_bang import get_bang_definition_and_autocomplete -from searx.search import EngineRef -from searx.webutils import VALID_LANGUAGE_CODE - - -class QueryPartParser(ABC): - - __slots__ = "raw_text_query", "enable_autocomplete" - - @staticmethod - @abstractmethod - def check(raw_value): - """Check if raw_value can be parsed""" - - def __init__(self, raw_text_query, enable_autocomplete): - self.raw_text_query = raw_text_query - self.enable_autocomplete = enable_autocomplete - - @abstractmethod - def __call__(self, raw_value): - """Try to parse raw_value: set the self.raw_text_query properties - - return True if raw_value has been parsed - - self.raw_text_query.autocomplete_list is also modified - if self.enable_autocomplete is True - """ - - def _add_autocomplete(self, value): - if value not in self.raw_text_query.autocomplete_list: - self.raw_text_query.autocomplete_list.append(value) - - -class TimeoutParser(QueryPartParser): - @staticmethod - def check(raw_value): - return raw_value[0] == '<' - - def __call__(self, raw_value): - value = raw_value[1:] - found = self._parse(value) if len(value) > 0 else False - if self.enable_autocomplete and not value: - self._autocomplete() - return found - - def _parse(self, value): - if not value.isdigit(): - return False - raw_timeout_limit = int(value) - if raw_timeout_limit < 100: - # below 100, the unit is the second ( <3 = 3 seconds timeout ) - self.raw_text_query.timeout_limit = float(raw_timeout_limit) - else: - # 100 or above, the unit is the millisecond ( <850 = 850 milliseconds timeout ) - self.raw_text_query.timeout_limit = raw_timeout_limit / 1000.0 - return True - - def _autocomplete(self): - for suggestion in ['<3', '<850']: - self._add_autocomplete(suggestion) - - -class LanguageParser(QueryPartParser): - @staticmethod - def check(raw_value): - return raw_value[0] == ':' - - def __call__(self, raw_value): - value = raw_value[1:].lower().replace('_', '-') - found = self._parse(value) if len(value) > 0 else False - if self.enable_autocomplete and not found: - 
self._autocomplete(value) - return found - - def _parse(self, value): - found = False - # check if any language-code is equal with - # declared language-codes - for lc in sxng_locales: - lang_id, lang_name, country, english_name, _flag = map(str.lower, lc) - - # if correct language-code is found - # set it as new search-language - - if ( - value == lang_id or value == lang_name or value == english_name or value.replace('-', ' ') == country - ) and value not in self.raw_text_query.languages: - found = True - lang_parts = lang_id.split('-') - if len(lang_parts) == 2: - self.raw_text_query.languages.append(lang_parts[0] + '-' + lang_parts[1].upper()) - else: - self.raw_text_query.languages.append(lang_id) - # to ensure best match (first match is not necessarily the best one) - if value == lang_id: - break - - # user may set a valid, yet not selectable language - if VALID_LANGUAGE_CODE.match(value) or value == 'auto': - lang_parts = value.split('-') - if len(lang_parts) > 1: - value = lang_parts[0].lower() + '-' + lang_parts[1].upper() - if value not in self.raw_text_query.languages: - self.raw_text_query.languages.append(value) - found = True - - return found - - def _autocomplete(self, value): - if not value: - # show some example queries - if len(settings['search']['languages']) < 10: - for lang in settings['search']['languages']: - self.raw_text_query.autocomplete_list.append(':' + lang) - else: - for lang in [":en", ":en_us", ":english", ":united_kingdom"]: - self.raw_text_query.autocomplete_list.append(lang) - return - - for lc in sxng_locales: - if lc[0] not in settings['search']['languages']: - continue - lang_id, lang_name, country, english_name, _flag = map(str.lower, lc) - - # check if query starts with language-id - if lang_id.startswith(value): - if len(value) <= 2: - self._add_autocomplete(':' + lang_id.split('-')[0]) - else: - self._add_autocomplete(':' + lang_id) - - # check if query starts with language name - if lang_name.startswith(value) or english_name.startswith(value): - self._add_autocomplete(':' + lang_name) - - # check if query starts with country - # here "new_zealand" is "new-zealand" (see __call__) - if country.startswith(value.replace('-', ' ')): - self._add_autocomplete(':' + country.replace(' ', '_')) - - -class ExternalBangParser(QueryPartParser): - @staticmethod - def check(raw_value): - return raw_value.startswith('!!') - - def __call__(self, raw_value): - value = raw_value[2:] - found, bang_ac_list = self._parse(value) if len(value) > 0 else (False, []) - if self.enable_autocomplete: - self._autocomplete(bang_ac_list) - return found - - def _parse(self, value): - found = False - bang_definition, bang_ac_list = get_bang_definition_and_autocomplete(value) - if bang_definition is not None: - self.raw_text_query.external_bang = value - found = True - return found, bang_ac_list - - def _autocomplete(self, bang_ac_list): - if not bang_ac_list: - bang_ac_list = ['g', 'ddg', 'bing'] - for external_bang in bang_ac_list: - self._add_autocomplete('!!' + external_bang) - - -class BangParser(QueryPartParser): - @staticmethod - def check(raw_value): - return raw_value[0] == '!' 
- - def __call__(self, raw_value): - value = raw_value[1:].replace('-', ' ').replace('_', ' ') - found = self._parse(value) if len(value) > 0 else False - if found and raw_value[0] == '!': - self.raw_text_query.specific = True - if self.enable_autocomplete: - self._autocomplete(raw_value[0], value) - return found - - def _parse(self, value): - # check if prefix is equal with engine shortcut - if value in engine_shortcuts: - value = engine_shortcuts[value] - - # check if prefix is equal with engine name - if value in engines: - self.raw_text_query.enginerefs.append(EngineRef(value, 'none')) - return True - - # check if prefix is equal with category name - if value in categories: - # using all engines for that search, which - # are declared under that category name - self.raw_text_query.enginerefs.extend( - EngineRef(engine.name, value) - for engine in categories[value] - if (engine.name, value) not in self.raw_text_query.disabled_engines - ) - return True - - return False - - def _autocomplete(self, first_char, value): - if not value: - # show some example queries - for suggestion in ['images', 'wikipedia', 'osm']: - if suggestion not in self.raw_text_query.disabled_engines or suggestion in categories: - self._add_autocomplete(first_char + suggestion) - return - - # check if query starts with category name - for category in categories: - if category.startswith(value): - self._add_autocomplete(first_char + category.replace(' ', '_')) - - # check if query starts with engine name - for engine in engines: - if engine.startswith(value): - self._add_autocomplete(first_char + engine.replace(' ', '_')) - - # check if query starts with engine shortcut - for engine_shortcut in engine_shortcuts: - if engine_shortcut.startswith(value): - self._add_autocomplete(first_char + engine_shortcut) - - -class RawTextQuery: - """parse raw text query (the value from the html input)""" - - PARSER_CLASSES = [ - TimeoutParser, # this force the timeout - LanguageParser, # this force a language - ExternalBangParser, # external bang (must be before BangParser) - BangParser, # this force a engine or category - ] - - def __init__(self, query, disabled_engines): - assert isinstance(query, str) - # input parameters - self.query = query - self.disabled_engines = disabled_engines if disabled_engines else [] - # parsed values - self.enginerefs = [] - self.languages = [] - self.timeout_limit = None - self.external_bang = None - self.specific = False - self.autocomplete_list = [] - # internal properties - self.query_parts = [] # use self.getFullQuery() - self.user_query_parts = [] # use self.getQuery() - self.autocomplete_location = None - self._parse_query() - - def _parse_query(self): - """ - parse self.query, if tags are set, which - change the search engine or search-language - """ - - # split query, including whitespaces - raw_query_parts = re.split(r'(\s+)', self.query) - - last_index_location = None - autocomplete_index = len(raw_query_parts) - 1 - - for i, query_part in enumerate(raw_query_parts): - # part does only contain spaces, skip - if query_part.isspace() or query_part == '': - continue - - # parse special commands - special_part = False - for parser_class in RawTextQuery.PARSER_CLASSES: - if parser_class.check(query_part): - special_part = parser_class(self, i == autocomplete_index)(query_part) - break - - # append query part to query_part list - qlist = self.query_parts if special_part else self.user_query_parts - qlist.append(query_part) - last_index_location = (qlist, len(qlist) - 1) - - 
self.autocomplete_location = last_index_location - - def get_autocomplete_full_query(self, text): - qlist, position = self.autocomplete_location - qlist[position] = text - return self.getFullQuery() - - def changeQuery(self, query): - self.user_query_parts = query.strip().split() - self.query = self.getFullQuery() - self.autocomplete_location = (self.user_query_parts, len(self.user_query_parts) - 1) - self.autocomplete_list = [] - return self - - def getQuery(self): - return ' '.join(self.user_query_parts) - - def getFullQuery(self): - """ - get full query including whitespaces - """ - return '{0} {1}'.format(' '.join(self.query_parts), self.getQuery()).strip() - - def __str__(self): - return self.getFullQuery() - - def __repr__(self): - return ( - f"<{self.__class__.__name__} " - + f"query={self.query!r} " - + f"disabled_engines={self.disabled_engines!r}\n " - + f"languages={self.languages!r} " - + f"timeout_limit={self.timeout_limit!r} " - + f"external_bang={self.external_bang!r} " - + f"specific={self.specific!r} " - + f"enginerefs={self.enginerefs!r}\n " - + f"autocomplete_list={self.autocomplete_list!r}\n " - + f"query_parts={self.query_parts!r}\n " - + f"user_query_parts={self.user_query_parts!r} >" - ) diff --git a/apps/searxng/searx/redisdb.py b/apps/searxng/searx/redisdb.py deleted file mode 100755 index 0544d69..0000000 --- a/apps/searxng/searx/redisdb.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Implementation of the redis client (redis-py_). - -.. _redis-py: https://github.com/redis/redis-py - -This implementation uses the :ref:`settings redis` setup from ``settings.yml``. -A redis DB connect can be tested by:: - - >>> from searx import redisdb - >>> redisdb.initialize() - True - >>> db = redisdb.client() - >>> db.set("foo", "bar") - True - >>> db.get("foo") - b'bar' - >>> - -""" - -import os -import pwd -import logging -import redis -from searx import get_setting - - -OLD_REDIS_URL_DEFAULT_URL = 'unix:///usr/local/searxng-redis/run/redis.sock?db=0' -"""This was the default Redis URL in settings.yml.""" - -_CLIENT = None -logger = logging.getLogger(__name__) - - -def client() -> redis.Redis: - return _CLIENT - - -def initialize(): - global _CLIENT # pylint: disable=global-statement - redis_url = get_setting('redis.url') - if not redis_url: - return False - try: - # create a client, but no connection is done - _CLIENT = redis.Redis.from_url(redis_url) - - # log the parameters as seen by the redis lib, without the password - kwargs = _CLIENT.get_connection_kwargs().copy() - kwargs.pop('password', None) - kwargs = ' '.join([f'{k}={v!r}' for k, v in kwargs.items()]) - logger.info("connecting to Redis %s", kwargs) - - # check the connection - _CLIENT.ping() - - # no error: the redis connection is working - logger.info("connected to Redis") - return True - except redis.exceptions.RedisError as e: - _CLIENT = None - _pw = pwd.getpwuid(os.getuid()) - logger.exception("[%s (%s)] can't connect redis DB ...", _pw.pw_name, _pw.pw_uid) - if redis_url == OLD_REDIS_URL_DEFAULT_URL and isinstance(e, redis.exceptions.ConnectionError): - logger.info( - "You can safely ignore the above Redis error if you don't use Redis. " - "You can remove this error by setting redis.url to false in your settings.yml." 
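
The removed ``searx/query.py`` parses inline query modifiers (``<`` timeout, ``:`` language, ``!``/``!!`` bangs). A hedged usage sketch, assuming the upstream SearXNG package these files were vendored from is importable; the expected values follow from the parser code above:

    from searx.query import RawTextQuery

    q = RawTextQuery('<3 :en slow loris', disabled_engines=[])
    q.timeout_limit   # expected 3.0 -- "<3" sets a 3 second timeout
    q.languages       # expected ['en']
    q.getQuery()      # expected 'slow loris'
    q.getFullQuery()  # expected '<3 :en slow loris'
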
- ) - return False diff --git a/apps/searxng/searx/redislib.py b/apps/searxng/searx/redislib.py deleted file mode 100755 index a90e15b..0000000 --- a/apps/searxng/searx/redislib.py +++ /dev/null @@ -1,241 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""A collection of convenient functions and redis/lua scripts. - -This code was partial inspired by the `Bullet-Proofing Lua Scripts in RedisPy`_ -article. - -.. _Bullet-Proofing Lua Scripts in RedisPy: - https://redis.com/blog/bullet-proofing-lua-scripts-in-redispy/ - -""" - -import hmac - -from searx import get_setting - -LUA_SCRIPT_STORAGE = {} -"""A global dictionary to cache client's ``Script`` objects, used by -:py:obj:`lua_script_storage`""" - - -def lua_script_storage(client, script): - """Returns a redis :py:obj:`Script - ` instance. - - Due to performance reason the ``Script`` object is instantiated only once - for a client (``client.register_script(..)``) and is cached in - :py:obj:`LUA_SCRIPT_STORAGE`. - - """ - - # redis connection can be closed, lets use the id() of the redis connector - # as key in the script-storage: - client_id = id(client) - - if LUA_SCRIPT_STORAGE.get(client_id) is None: - LUA_SCRIPT_STORAGE[client_id] = {} - - if LUA_SCRIPT_STORAGE[client_id].get(script) is None: - LUA_SCRIPT_STORAGE[client_id][script] = client.register_script(script) - - return LUA_SCRIPT_STORAGE[client_id][script] - - -PURGE_BY_PREFIX = """ -local prefix = tostring(ARGV[1]) -for i, name in ipairs(redis.call('KEYS', prefix .. '*')) do - redis.call('EXPIRE', name, 0) -end -""" - - -def purge_by_prefix(client, prefix: str = "SearXNG_"): - """Purge all keys with ``prefix`` from database. - - Queries all keys in the database by the given prefix and set expire time to - zero. The default prefix will drop all keys which has been set by SearXNG - (drops SearXNG schema entirely from database). - - The implementation is the lua script from string :py:obj:`PURGE_BY_PREFIX`. - The lua script uses EXPIRE_ instead of DEL_: if there are a lot keys to - delete and/or their values are big, `DEL` could take more time and blocks - the command loop while `EXPIRE` turns back immediate. - - :param prefix: prefix of the key to delete (default: ``SearXNG_``) - :type name: str - - .. _EXPIRE: https://redis.io/commands/expire/ - .. _DEL: https://redis.io/commands/del/ - - """ - script = lua_script_storage(client, PURGE_BY_PREFIX) - script(args=[prefix]) - - -def secret_hash(name: str): - """Creates a hash of the ``name``. - - Combines argument ``name`` with the ``secret_key`` from :ref:`settings - server`. This function can be used to get a more anonymised name of a Redis - KEY. - - :param name: the name to create a secret hash for - :type name: str - """ - m = hmac.new(bytes(name, encoding='utf-8'), digestmod='sha256') - m.update(bytes(get_setting('server.secret_key'), encoding='utf-8')) - return m.hexdigest() - - -INCR_COUNTER = """ -local limit = tonumber(ARGV[1]) -local expire = tonumber(ARGV[2]) -local c_name = KEYS[1] - -local c = redis.call('GET', c_name) - -if not c then - c = redis.call('INCR', c_name) - if expire > 0 then - redis.call('EXPIRE', c_name, expire) - end -else - c = tonumber(c) - if limit == 0 or c < limit then - c = redis.call('INCR', c_name) - end -end -return c -""" - - -def incr_counter(client, name: str, limit: int = 0, expire: int = 0): - """Increment a counter and return the new value. - - If counter with redis key ``SearXNG_counter_`` does not exists it is - created with initial value 1 returned. 
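
The ``secret_hash`` helper removed just above anonymises Redis key names with an HMAC where the key name is the HMAC key and the instance secret is the message. A standalone sketch of the same construction; ``SECRET_KEY`` is a placeholder, the real code reads ``server.secret_key`` from settings.yml via ``get_setting()``:

    import hmac

    SECRET_KEY = 'ultrasecretkey'  # assumed placeholder value

    def secret_hash(name: str) -> str:
        # name is the HMAC key, the instance secret is the message (as in redislib.py)
        m = hmac.new(name.encode('utf-8'), digestmod='sha256')
        m.update(SECRET_KEY.encode('utf-8'))
        return m.hexdigest()

    redis_key = 'SearXNG_counter_' + secret_hash('my counter')
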
The replacement ```` is a - *secret hash* of the value from argument ``name`` (see - :py:func:`secret_hash`). - - The implementation of the redis counter is the lua script from string - :py:obj:`INCR_COUNTER`. - - :param name: name of the counter - :type name: str - - :param expire: live-time of the counter in seconds (default ``None`` means - infinite). - :type expire: int / see EXPIRE_ - - :param limit: limit where the counter stops to increment (default ``None``) - :type limit: int / limit is 2^64 see INCR_ - - :return: value of the incremented counter - :type return: int - - .. _EXPIRE: https://redis.io/commands/expire/ - .. _INCR: https://redis.io/commands/incr/ - - A simple demo of a counter with expire time and limit:: - - >>> for i in range(6): - ... i, incr_counter(client, "foo", 3, 5) # max 3, duration 5 sec - ... time.sleep(1) # from the third call on max has been reached - ... - (0, 1) - (1, 2) - (2, 3) - (3, 3) - (4, 3) - (5, 1) - - """ - script = lua_script_storage(client, INCR_COUNTER) - name = "SearXNG_counter_" + secret_hash(name) - c = script(args=[limit, expire], keys=[name]) - return c - - -def drop_counter(client, name): - """Drop counter with redis key ``SearXNG_counter_`` - - The replacement ```` is a *secret hash* of the value from argument - ``name`` (see :py:func:`incr_counter` and :py:func:`incr_sliding_window`). - """ - name = "SearXNG_counter_" + secret_hash(name) - client.delete(name) - - -INCR_SLIDING_WINDOW = """ -local expire = tonumber(ARGV[1]) -local name = KEYS[1] -local current_time = redis.call('TIME') - -redis.call('ZREMRANGEBYSCORE', name, 0, current_time[1] - expire) -redis.call('ZADD', name, current_time[1], current_time[1] .. current_time[2]) -local result = redis.call('ZCOUNT', name, 0, current_time[1] + 1) -redis.call('EXPIRE', name, expire) -return result -""" - - -def incr_sliding_window(client, name: str, duration: int): - """Increment a sliding-window counter and return the new value. - - If counter with redis key ``SearXNG_counter_`` does not exists it is - created with initial value 1 returned. The replacement ```` is a - *secret hash* of the value from argument ``name`` (see - :py:func:`secret_hash`). - - :param name: name of the counter - :type name: str - - :param duration: live-time of the sliding window in seconds - :typeduration: int - - :return: value of the incremented counter - :type return: int - - The implementation of the redis counter is the lua script from string - :py:obj:`INCR_SLIDING_WINDOW`. The lua script uses `sorted sets in Redis`_ - to implement a sliding window for the redis key ``SearXNG_counter_`` - (ZADD_). The current TIME_ is used to score the items in the sorted set and - the time window is moved by removing items with a score lower current time - minus *duration* time (ZREMRANGEBYSCORE_). - - The EXPIRE_ time (the duration of the sliding window) is refreshed on each - call (incrementation) and if there is no call in this duration, the sorted - set expires from the redis DB. - - The return value is the amount of items in the sorted set (ZCOUNT_), what - means the number of calls in the sliding window. - - .. _Sorted sets in Redis: - https://redis.com/ebook/part-1-getting-started/chapter-1-getting-to-know-redis/1-2-what-redis-data-structures-look-like/1-2-5-sorted-sets-in-redis/ - .. _TIME: https://redis.io/commands/time/ - .. _ZADD: https://redis.io/commands/zadd/ - .. _EXPIRE: https://redis.io/commands/expire/ - .. _ZREMRANGEBYSCORE: https://redis.io/commands/zremrangebyscore/ - .. 
_ZCOUNT: https://redis.io/commands/zcount/ - - A simple demo of the sliding window:: - - >>> for i in range(5): - ... incr_sliding_window(client, "foo", 3) # duration 3 sec - ... time.sleep(1) # from the third call (second) on the window is moved - ... - 1 - 2 - 3 - 3 - 3 - >>> time.sleep(3) # wait until expire - >>> incr_sliding_window(client, "foo", 3) - 1 - - """ - script = lua_script_storage(client, INCR_SLIDING_WINDOW) - name = "SearXNG_counter_" + secret_hash(name) - c = script(args=[duration], keys=[name]) - return c diff --git a/apps/searxng/searx/results.py b/apps/searxng/searx/results.py deleted file mode 100755 index caf0221..0000000 --- a/apps/searxng/searx/results.py +++ /dev/null @@ -1,445 +0,0 @@ -import re -from collections import defaultdict -from operator import itemgetter -from threading import RLock -from typing import List, NamedTuple, Set -from urllib.parse import urlparse, unquote - -from searx import logger -from searx import utils -from searx.engines import engines -from searx.metrics import histogram_observe, counter_add, count_error - - -CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) -WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) - - -# return the meaningful length of the content for a result -def result_content_len(content): - if isinstance(content, str): - return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content)) - else: - return 0 - - -def compare_urls(url_a, url_b): - """Lazy compare between two URL. - "www.example.com" and "example.com" are equals. - "www.example.com/path/" and "www.example.com/path" are equals. - "https://www.example.com/" and "http://www.example.com/" are equals. - - Args: - url_a (ParseResult): first URL - url_b (ParseResult): second URL - - Returns: - bool: True if url_a and url_b are equals - """ - # ignore www. 
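
A sketch of how the sliding-window counter removed above can back a simple rate limit, assuming the upstream SearXNG modules are importable and Redis is configured; the 10-calls-per-60-seconds numbers are arbitrary examples:

    from searx import redisdb
    from searx.redislib import incr_sliding_window

    redisdb.initialize()
    client = redisdb.client()

    def allowed(key: str, limit: int = 10, window: int = 60) -> bool:
        # incr_sliding_window() returns the number of calls recorded inside the window
        return incr_sliding_window(client, key, window) <= limit
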
in comparison - if url_a.netloc.startswith('www.'): - host_a = url_a.netloc.replace('www.', '', 1) - else: - host_a = url_a.netloc - if url_b.netloc.startswith('www.'): - host_b = url_b.netloc.replace('www.', '', 1) - else: - host_b = url_b.netloc - - if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment: - return False - - # remove / from the end of the url if required - path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path - path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path - - return unquote(path_a) == unquote(path_b) - - -def merge_two_infoboxes(infobox1, infobox2): - # get engines weights - if hasattr(engines[infobox1['engine']], 'weight'): - weight1 = engines[infobox1['engine']].weight - else: - weight1 = 1 - if hasattr(engines[infobox2['engine']], 'weight'): - weight2 = engines[infobox2['engine']].weight - else: - weight2 = 1 - - if weight2 > weight1: - infobox1['engine'] = infobox2['engine'] - - infobox1['engines'] |= infobox2['engines'] - - if 'urls' in infobox2: - urls1 = infobox1.get('urls', None) - if urls1 is None: - urls1 = [] - - for url2 in infobox2.get('urls', []): - unique_url = True - parsed_url2 = urlparse(url2.get('url', '')) - entity_url2 = url2.get('entity') - for url1 in urls1: - if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls( - urlparse(url1.get('url', '')), parsed_url2 - ): - unique_url = False - break - if unique_url: - urls1.append(url2) - - infobox1['urls'] = urls1 - - if 'img_src' in infobox2: - img1 = infobox1.get('img_src', None) - img2 = infobox2.get('img_src') - if img1 is None: - infobox1['img_src'] = img2 - elif weight2 > weight1: - infobox1['img_src'] = img2 - - if 'attributes' in infobox2: - attributes1 = infobox1.get('attributes') - if attributes1 is None: - infobox1['attributes'] = attributes1 = [] - - attributeSet = set() - for attribute in attributes1: - label = attribute.get('label') - if label not in attributeSet: - attributeSet.add(label) - entity = attribute.get('entity') - if entity not in attributeSet: - attributeSet.add(entity) - - for attribute in infobox2.get('attributes', []): - if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet: - attributes1.append(attribute) - - if 'content' in infobox2: - content1 = infobox1.get('content', None) - content2 = infobox2.get('content', '') - if content1 is not None: - if result_content_len(content2) > result_content_len(content1): - infobox1['content'] = content2 - else: - infobox1['content'] = content2 - - -def result_score(result): - weight = 1.0 - - for result_engine in result['engines']: - if hasattr(engines[result_engine], 'weight'): - weight *= float(engines[result_engine].weight) - - occurrences = len(result['positions']) - - return sum((occurrences * weight) / position for position in result['positions']) - - -class Timing(NamedTuple): - engine: str - total: float - load: float - - -class UnresponsiveEngine(NamedTuple): - engine: str - error_type: str - suspended: bool - - -class ResultContainer: - """docstring for ResultContainer""" - - __slots__ = ( - '_merged_results', - 'infoboxes', - 'suggestions', - 'answers', - 'corrections', - '_number_of_results', - '_closed', - 'paging', - 'unresponsive_engines', - 'timings', - 'redirect_url', - 'engine_data', - 'on_result', - '_lock', - ) - - def __init__(self): - super().__init__() - self._merged_results = [] - self.infoboxes = [] - self.suggestions = set() - self.answers = {} - self.corrections = set() - 
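
The ``compare_urls`` helper shown above treats URLs as equal when they differ only in scheme, a leading "www." or a single trailing slash. A small sketch, assuming the upstream module is importable:

    from urllib.parse import urlparse
    from searx.results import compare_urls

    a = urlparse('https://www.example.com/path/')
    b = urlparse('http://example.com/path')
    compare_urls(a, b)  # True: scheme, leading "www." and one trailing "/" are ignored
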
self._number_of_results = [] - self.engine_data = defaultdict(dict) - self._closed = False - self.paging = False - self.unresponsive_engines: Set[UnresponsiveEngine] = set() - self.timings: List[Timing] = [] - self.redirect_url = None - self.on_result = lambda _: True - self._lock = RLock() - - def extend(self, engine_name, results): - if self._closed: - return - - standard_result_count = 0 - error_msgs = set() - for result in list(results): - result['engine'] = engine_name - if 'suggestion' in result and self.on_result(result): - self.suggestions.add(result['suggestion']) - elif 'answer' in result and self.on_result(result): - self.answers[result['answer']] = result - elif 'correction' in result and self.on_result(result): - self.corrections.add(result['correction']) - elif 'infobox' in result and self.on_result(result): - self._merge_infobox(result) - elif 'number_of_results' in result and self.on_result(result): - self._number_of_results.append(result['number_of_results']) - elif 'engine_data' in result and self.on_result(result): - self.engine_data[engine_name][result['key']] = result['engine_data'] - elif 'url' in result: - # standard result (url, title, content) - if not self._is_valid_url_result(result, error_msgs): - continue - # normalize the result - self._normalize_url_result(result) - # call on_result call searx.search.SearchWithPlugins._on_result - # which calls the plugins - if not self.on_result(result): - continue - self.__merge_url_result(result, standard_result_count + 1) - standard_result_count += 1 - elif self.on_result(result): - self.__merge_result_no_url(result, standard_result_count + 1) - standard_result_count += 1 - - if len(error_msgs) > 0: - for msg in error_msgs: - count_error(engine_name, 'some results are invalids: ' + msg, secondary=True) - - if engine_name in engines: - histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count') - - if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging: - self.paging = True - - def _merge_infobox(self, infobox): - add_infobox = True - infobox_id = infobox.get('id', None) - infobox['engines'] = set([infobox['engine']]) - if infobox_id is not None: - parsed_url_infobox_id = urlparse(infobox_id) - with self._lock: - for existingIndex in self.infoboxes: - if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id): - merge_two_infoboxes(existingIndex, infobox) - add_infobox = False - - if add_infobox: - self.infoboxes.append(infobox) - - def _is_valid_url_result(self, result, error_msgs): - if 'url' in result: - if not isinstance(result['url'], str): - logger.debug('result: invalid URL: %s', str(result)) - error_msgs.add('invalid URL') - return False - - if 'title' in result and not isinstance(result['title'], str): - logger.debug('result: invalid title: %s', str(result)) - error_msgs.add('invalid title') - return False - - if 'content' in result: - if not isinstance(result['content'], str): - logger.debug('result: invalid content: %s', str(result)) - error_msgs.add('invalid content') - return False - - return True - - def _normalize_url_result(self, result): - """Return True if the result is valid""" - result['parsed_url'] = urlparse(result['url']) - - # if the result has no scheme, use http as default - if not result['parsed_url'].scheme: - result['parsed_url'] = result['parsed_url']._replace(scheme="http") - result['url'] = result['parsed_url'].geturl() - - # avoid duplicate content between the content and title fields - if 
result.get('content') == result.get('title'): - del result['content'] - - # make sure there is a template - if 'template' not in result: - result['template'] = 'default.html' - - # strip multiple spaces and carriage returns from content - if result.get('content'): - result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) - - def __merge_url_result(self, result, position): - result['engines'] = set([result['engine']]) - with self._lock: - duplicated = self.__find_duplicated_http_result(result) - if duplicated: - self.__merge_duplicated_http_result(duplicated, result, position) - return - - # if there is no duplicate found, append result - result['positions'] = [position] - self._merged_results.append(result) - - def __find_duplicated_http_result(self, result): - result_template = result.get('template') - for merged_result in self._merged_results: - if 'parsed_url' not in merged_result: - continue - if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get( - 'template' - ): - if result_template != 'images.html': - # not an image, same template, same url : it's a duplicate - return merged_result - else: - # it's an image - # it's a duplicate if the parsed_url, template and img_src are different - if result.get('img_src', '') == merged_result.get('img_src', ''): - return merged_result - return None - - def __merge_duplicated_http_result(self, duplicated, result, position): - # using content with more text - if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')): - duplicated['content'] = result['content'] - - # merge all result's parameters not found in duplicate - for key in result.keys(): - if not duplicated.get(key): - duplicated[key] = result.get(key) - - # add the new position - duplicated['positions'].append(position) - - # add engine to list of result-engines - duplicated['engines'].add(result['engine']) - - # using https if possible - if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': - duplicated['url'] = result['parsed_url'].geturl() - duplicated['parsed_url'] = result['parsed_url'] - - def __merge_result_no_url(self, result, position): - result['engines'] = set([result['engine']]) - result['positions'] = [position] - with self._lock: - self._merged_results.append(result) - - def close(self): - self._closed = True - - for result in self._merged_results: - score = result_score(result) - result['score'] = score - if result.get('content'): - result['content'] = utils.html_to_text(result['content']).strip() - # removing html content and whitespace duplications - result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split()) - for result_engine in result['engines']: - counter_add(score, 'engine', result_engine, 'score') - - results = sorted(self._merged_results, key=itemgetter('score'), reverse=True) - - # pass 2 : group results by category and template - gresults = [] - categoryPositions = {} - - for res in results: - # FIXME : handle more than one category per engine - engine = engines[res['engine']] - res['category'] = engine.categories[0] if len(engine.categories) > 0 else '' - - # FIXME : handle more than one category per engine - category = ( - res['category'] - + ':' - + res.get('template', '') - + ':' - + ('img_src' if 'img_src' in res or 'thumbnail' in res else '') - ) - - current = None if category not in categoryPositions else categoryPositions[category] - - # group with previous results using the same category - # if 
the group can accept more result and is not too far - # from the current position - if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20): - # group with the previous results using - # the same category with this one - index = current['index'] - gresults.insert(index, res) - - # update every index after the current one - # (including the current one) - for k in categoryPositions: - v = categoryPositions[k]['index'] - if v >= index: - categoryPositions[k]['index'] = v + 1 - - # update this category - current['count'] -= 1 - - else: - # same category - gresults.append(res) - - # update categoryIndex - categoryPositions[category] = {'index': len(gresults), 'count': 8} - - # update _merged_results - self._merged_results = gresults - - def get_ordered_results(self): - if not self._closed: - self.close() - return self._merged_results - - def results_length(self): - return len(self._merged_results) - - @property - def number_of_results(self) -> int: - """Returns the average of results number, returns zero if the average - result number is smaller than the actual result count.""" - - resultnum_sum = sum(self._number_of_results) - if not resultnum_sum or not self._number_of_results: - return 0 - - average = int(resultnum_sum / len(self._number_of_results)) - if average < self.results_length(): - average = 0 - return average - - def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False): - if engines[engine_name].display_error_messages: - self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended)) - - def add_timing(self, engine_name: str, engine_time: float, page_load_time: float): - self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time)) - - def get_timings(self): - return self.timings diff --git a/apps/searxng/searx/search/__init__.py b/apps/searxng/searx/search/__init__.py deleted file mode 100755 index 478424a..0000000 --- a/apps/searxng/searx/search/__init__.py +++ /dev/null @@ -1,214 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring, too-few-public-methods - -import threading -from copy import copy -from timeit import default_timer -from uuid import uuid4 - -import flask -from flask import copy_current_request_context -import babel - -from searx import settings -from searx.answerers import ask -from searx.external_bang import get_bang_url -from searx.results import ResultContainer -from searx import logger -from searx.plugins import plugins -from searx.search.models import EngineRef, SearchQuery -from searx.engines import load_engines -from searx.network import initialize as initialize_network, check_network_configuration -from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time -from searx.search.processors import PROCESSORS, initialize as initialize_processors -from searx.search.checker import initialize as initialize_checker - - -logger = logger.getChild('search') - - -def initialize(settings_engines=None, enable_checker=False, check_network=False, enable_metrics=True): - settings_engines = settings_engines or settings['engines'] - load_engines(settings_engines) - initialize_network(settings_engines, settings['outgoing']) - if check_network: - check_network_configuration() - initialize_metrics([engine['name'] for engine in settings_engines], enable_metrics) - initialize_processors(settings_engines) - if enable_checker: - initialize_checker() - - -class Search: - 
"""Search information container""" - - __slots__ = "search_query", "result_container", "start_time", "actual_timeout" - - def __init__(self, search_query: SearchQuery): - """Initialize the Search""" - # init vars - super().__init__() - self.search_query = search_query - self.result_container = ResultContainer() - self.start_time = None - self.actual_timeout = None - - def search_external_bang(self): - """ - Check if there is a external bang. - If yes, update self.result_container and return True - """ - if self.search_query.external_bang: - self.result_container.redirect_url = get_bang_url(self.search_query) - - # This means there was a valid bang and the - # rest of the search does not need to be continued - if isinstance(self.result_container.redirect_url, str): - return True - return False - - def search_answerers(self): - """ - Check if an answer return a result. - If yes, update self.result_container and return True - """ - answerers_results = ask(self.search_query) - - if answerers_results: - for results in answerers_results: - self.result_container.extend('answer', results) - return True - return False - - # do search-request - def _get_requests(self): - # init vars - requests = [] - - # max of all selected engine timeout - default_timeout = 0 - - # start search-reqest for all selected engines - for engineref in self.search_query.engineref_list: - processor = PROCESSORS[engineref.name] - - # stop the request now if the engine is suspend - if processor.extend_container_if_suspended(self.result_container): - continue - - # set default request parameters - request_params = processor.get_params(self.search_query, engineref.category) - if request_params is None: - continue - - counter_inc('engine', engineref.name, 'search', 'count', 'sent') - - # append request to list - requests.append((engineref.name, self.search_query.query, request_params)) - - # update default_timeout - default_timeout = max(default_timeout, processor.engine.timeout) - - # adjust timeout - max_request_timeout = settings['outgoing']['max_request_timeout'] - actual_timeout = default_timeout - query_timeout = self.search_query.timeout_limit - - if max_request_timeout is None and query_timeout is None: - # No max, no user query: default_timeout - pass - elif max_request_timeout is None and query_timeout is not None: - # No max, but user query: From user query except if above default - actual_timeout = min(default_timeout, query_timeout) - elif max_request_timeout is not None and query_timeout is None: - # Max, no user query: Default except if above max - actual_timeout = min(default_timeout, max_request_timeout) - elif max_request_timeout is not None and query_timeout is not None: - # Max & user query: From user query except if above max - actual_timeout = min(query_timeout, max_request_timeout) - - logger.debug( - "actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})".format( - actual_timeout, default_timeout, query_timeout, max_request_timeout - ) - ) - - return requests, actual_timeout - - def search_multiple_requests(self, requests): - # pylint: disable=protected-access - search_id = str(uuid4()) - - for engine_name, query, request_params in requests: - _search = copy_current_request_context(PROCESSORS[engine_name].search) - th = threading.Thread( # pylint: disable=invalid-name - target=_search, - args=(query, request_params, self.result_container, self.start_time, self.actual_timeout), - name=search_id, - ) - th._timeout = False - th._engine_name = engine_name - th.start() - - for 
th in threading.enumerate(): # pylint: disable=invalid-name - if th.name == search_id: - remaining_time = max(0.0, self.actual_timeout - (default_timer() - self.start_time)) - th.join(remaining_time) - if th.is_alive(): - th._timeout = True - self.result_container.add_unresponsive_engine(th._engine_name, 'timeout') - PROCESSORS[th._engine_name].logger.error('engine timeout') - - def search_standard(self): - """ - Update self.result_container, self.actual_timeout - """ - requests, self.actual_timeout = self._get_requests() - - # send all search-request - if requests: - self.search_multiple_requests(requests) - - # return results, suggestions, answers and infoboxes - return True - - # do search-request - def search(self) -> ResultContainer: - self.start_time = default_timer() - if not self.search_external_bang(): - if not self.search_answerers(): - self.search_standard() - return self.result_container - - -class SearchWithPlugins(Search): - """Inherit from the Search class, add calls to the plugins.""" - - __slots__ = 'ordered_plugin_list', 'request' - - def __init__(self, search_query: SearchQuery, ordered_plugin_list, request: flask.Request): - super().__init__(search_query) - self.ordered_plugin_list = ordered_plugin_list - self.result_container.on_result = self._on_result - # pylint: disable=line-too-long - # get the "real" request to use it outside the Flask context. - # see - # * https://github.com/pallets/flask/blob/d01d26e5210e3ee4cbbdef12f05c886e08e92852/src/flask/globals.py#L55 - # * https://github.com/pallets/werkzeug/blob/3c5d3c9bd0d9ce64590f0af8997a38f3823b368d/src/werkzeug/local.py#L548-L559 - # * https://werkzeug.palletsprojects.com/en/2.0.x/local/#werkzeug.local.LocalProxy._get_current_object - # pylint: enable=line-too-long - self.request = request._get_current_object() - - def _on_result(self, result): - return plugins.call(self.ordered_plugin_list, 'on_result', self.request, self, result) - - def search(self) -> ResultContainer: - if plugins.call(self.ordered_plugin_list, 'pre_search', self.request, self): - super().search() - - plugins.call(self.ordered_plugin_list, 'post_search', self.request, self) - - self.result_container.close() - - return self.result_container diff --git a/apps/searxng/searx/search/checker/__init__.py b/apps/searxng/searx/search/checker/__init__.py deleted file mode 100755 index 7d779a2..0000000 --- a/apps/searxng/searx/search/checker/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -from .impl import Checker -from .background import initialize, get_result - -__all__ = ('Checker', 'initialize', 'get_result') diff --git a/apps/searxng/searx/search/checker/__main__.py b/apps/searxng/searx/search/checker/__main__.py deleted file mode 100755 index 15fcb5e..0000000 --- a/apps/searxng/searx/search/checker/__main__.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring - -import sys -import io -import os -import argparse -import logging - -import searx.search -import searx.search.checker -from searx.search import PROCESSORS -from searx.engines import engine_shortcuts - - -# configure logging -root = logging.getLogger() -handler = logging.StreamHandler(sys.stdout) -for h in root.handlers: - root.removeHandler(h) -root.addHandler(handler) - -# color only for a valid terminal -if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: - RESET_SEQ = "\033[0m" - COLOR_SEQ = "\033[1;%dm" - BOLD_SEQ = "\033[1m" - BLACK, 
RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) -else: - RESET_SEQ = "" - COLOR_SEQ = "" - BOLD_SEQ = "" - BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" - -# equivalent of 'python -u' (unbuffered stdout, stderr) -stdout = io.TextIOWrapper( - # pylint: disable=consider-using-with - open(sys.stdout.fileno(), 'wb', 0), - write_through=True, -) -stderr = io.TextIOWrapper( - # pylint: disable=consider-using-with - open(sys.stderr.fileno(), 'wb', 0), - write_through=True, -) - - -# iterator of processors -def iter_processor(engine_name_list): - if len(engine_name_list) > 0: - for name in engine_name_list: - name = engine_shortcuts.get(name, name) - processor = PROCESSORS.get(name) - if processor is not None: - yield name, processor - else: - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}') - else: - for name, processor in searx.search.PROCESSORS.items(): - yield name, processor - - -# actual check & display -def run(engine_name_list, verbose): - searx.search.initialize() - for name, processor in iter_processor(engine_name_list): - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') - if not sys.stdout.isatty(): - stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') - checker = searx.search.checker.Checker(processor) - checker.run() - if checker.test_results.successful: - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') - if verbose: - stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') - else: - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') - if not verbose: - errors = [test_name + ': ' + error for test_name, error in checker.test_results] - stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') - else: - stdout.write('\n') - stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') - for test_name, logs in checker.test_results.logs.items(): - for log in logs: - log = map(lambda l: l if isinstance(l, str) else repr(l), log) - stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') - - -# call by setup.py -def main(): - parser = argparse.ArgumentParser(description='Check searx engines.') - parser.add_argument( - 'engine_name_list', - metavar='engine name', - type=str, - nargs='*', - help='engines name or shortcut list. 
Empty for all engines.', - ) - parser.add_argument( - '--verbose', - '-v', - action='store_true', - dest='verbose', - help='Display details about the test results', - default=False, - ) - args = parser.parse_args() - run(args.engine_name_list, args.verbose) - - -if __name__ == '__main__': - main() diff --git a/apps/searxng/searx/search/checker/background.py b/apps/searxng/searx/search/checker/background.py deleted file mode 100755 index aec2a17..0000000 --- a/apps/searxng/searx/search/checker/background.py +++ /dev/null @@ -1,171 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring -# pyright: basic - -import json -import time -import threading -import os -import signal -from typing import Dict, Union, List, Any, Tuple, Optional -from typing_extensions import TypedDict, Literal - -import redis.exceptions - -from searx import logger, settings, searx_debug -from searx.redisdb import client as get_redis_client -from searx.exceptions import SearxSettingsException -from searx.search.processors import PROCESSORS -from searx.search.checker import Checker -from searx.search.checker.scheduler import scheduler_function - - -REDIS_RESULT_KEY = 'SearXNG_checker_result' -REDIS_LOCK_KEY = 'SearXNG_checker_lock' - - -CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther'] - - -class CheckerOk(TypedDict): - """Checking the engines succeeded""" - - status: Literal['ok'] - engines: Dict[str, 'EngineResult'] - timestamp: int - - -class CheckerErr(TypedDict): - """Checking the engines failed""" - - status: Literal['error'] - timestamp: int - - -class CheckerOther(TypedDict): - """The status is unknown or disabled""" - - status: Literal['unknown', 'disabled'] - - -EngineResult = Union['EngineOk', 'EngineErr'] - - -class EngineOk(TypedDict): - """Checking the engine succeeded""" - - success: Literal[True] - - -class EngineErr(TypedDict): - """Checking the engine failed""" - - success: Literal[False] - errors: Dict[str, List[str]] - - -def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]: - if isinstance(every, int): - return (every, every) - - if ( - not isinstance(every, (tuple, list)) - or len(every) != 2 # type: ignore - or not isinstance(every[0], int) - or not isinstance(every[1], int) - ): - raise SearxSettingsException(error_msg, None) - return (every[0], every[1]) - - -def get_result() -> CheckerResult: - client = get_redis_client() - if client is None: - # without Redis, the checker is disabled - return {'status': 'disabled'} - serialized_result: Optional[bytes] = client.get(REDIS_RESULT_KEY) - if serialized_result is None: - # the Redis key does not exist - return {'status': 'unknown'} - return json.loads(serialized_result) - - -def _set_result(result: CheckerResult): - client = get_redis_client() - if client is None: - # without Redis, the function does nothing - return - client.set(REDIS_RESULT_KEY, json.dumps(result)) - - -def _timestamp(): - return int(time.time() / 3600) * 3600 - - -def run(): - try: - # use a Redis lock to make sure there is no checker running at the same time - # (this should not happen, this is a safety measure) - with get_redis_client().lock(REDIS_LOCK_KEY, blocking_timeout=60, timeout=3600): - logger.info('Starting checker') - result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()} - for name, processor in PROCESSORS.items(): - logger.debug('Checking %s engine', name) - checker = Checker(processor) - checker.run() - if checker.test_results.successful: - 
result['engines'][name] = {'success': True} - else: - result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} - - _set_result(result) - logger.info('Check done') - except redis.exceptions.LockError: - _set_result({'status': 'error', 'timestamp': _timestamp()}) - logger.exception('Error while running the checker') - except Exception: # pylint: disable=broad-except - _set_result({'status': 'error', 'timestamp': _timestamp()}) - logger.exception('Error while running the checker') - - -def _signal_handler(_signum: int, _frame: Any): - t = threading.Thread(target=run) - t.daemon = True - t.start() - - -def initialize(): - if hasattr(signal, 'SIGUSR1'): - # Windows doesn't support SIGUSR1 - logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) - signal.signal(signal.SIGUSR1, _signal_handler) - - # special case when debug is activate - if searx_debug and settings['checker']['off_when_debug']: - logger.info('debug mode: checker is disabled') - return - - # check value of checker.scheduling.every now - scheduling = settings['checker']['scheduling'] - if scheduling is None or not scheduling: - logger.info('Checker scheduler is disabled') - return - - # make sure there is a Redis connection - if get_redis_client() is None: - logger.error('The checker requires Redis') - return - - # start the background scheduler - every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not a int or list') - start_after_range = _get_interval( - scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not a int or list' - ) - t = threading.Thread( - target=scheduler_function, - args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run), - name='checker_scheduler', - ) - t.daemon = True - t.start() diff --git a/apps/searxng/searx/search/checker/impl.py b/apps/searxng/searx/search/checker/impl.py deleted file mode 100755 index 37f145e..0000000 --- a/apps/searxng/searx/search/checker/impl.py +++ /dev/null @@ -1,442 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import gc -import typing -import types -import functools -import itertools -from time import time -from timeit import default_timer -from urllib.parse import urlparse - -import re -import httpx - -from searx import network, logger -from searx.utils import gen_useragent, detect_language -from searx.results import ResultContainer -from searx.search.models import SearchQuery, EngineRef -from searx.search.processors import EngineProcessor -from searx.metrics import counter_inc - - -logger = logger.getChild('searx.search.checker') - -HTML_TAGS = [ - # fmt: off - 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', - 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', - 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', - 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', - 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', - 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', - 'frame', 'frameset' - # fmt: on -] - - -def get_check_no_html(): - rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS] - rep += ['' for tag in HTML_TAGS] - pattern = re.compile('|'.join(rep)) - - def f(text): - return 
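
Per ``initialize()`` in the removed ``checker/background.py``, a running worker registers SIGUSR1 to launch the checker in a daemon thread. A hedged sketch of triggering it; ``WORKER_PID`` is a made-up example, use the pid logged by ``initialize()`` at startup:

    import os
    import signal

    WORKER_PID = 12345  # placeholder: the pid reported in the "Send SIGUSR1 signal to pid ..." log line
    os.kill(WORKER_PID, signal.SIGUSR1)  # _signal_handler() then starts run() in a daemon thread
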
pattern.search(text.lower()) is None - - return f - - -_check_no_html = get_check_no_html() - - -def _is_url(url): - try: - result = urlparse(url) - except ValueError: - return False - if result.scheme not in ('http', 'https'): - return False - return True - - -@functools.lru_cache(maxsize=8192) -def _download_and_check_if_image(image_url: str) -> bool: - """Download an URL and check if the Content-Type starts with "image/" - This function should not be called directly: use _is_url_image - otherwise the cache of functools.lru_cache contains data: URL which might be huge. - """ - retry = 2 - - while retry > 0: - a = time() - try: - # use "image_proxy" (avoid HTTP/2) - network.set_context_network_name('image_proxy') - r, stream = network.stream( - 'GET', - image_url, - timeout=10.0, - allow_redirects=True, - headers={ - 'User-Agent': gen_useragent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US;q=0.5,en;q=0.3', - 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'Sec-GPC': '1', - 'Cache-Control': 'max-age=0', - }, - ) - r.close() - if r.status_code == 200: - is_image = r.headers.get('content-type', '').startswith('image/') - else: - is_image = False - del r - del stream - return is_image - except httpx.TimeoutException: - logger.error('Timeout for %s: %i', image_url, int(time() - a)) - retry -= 1 - except httpx.HTTPError: - logger.exception('Exception for %s', image_url) - return False - return False - - -def _is_url_image(image_url) -> bool: - """Normalize image_url""" - if not isinstance(image_url, str): - return False - - if image_url.startswith('//'): - image_url = 'https:' + image_url - - if image_url.startswith('data:'): - return image_url.startswith('data:image/') - - if not _is_url(image_url): - return False - - return _download_and_check_if_image(image_url) - - -def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: - return { - 'query': search_query.query, - 'lang': search_query.lang, - 'pageno': search_query.pageno, - 'safesearch': search_query.safesearch, - 'time_range': search_query.time_range, - } - - -def _search_query_diff( - sq1: SearchQuery, sq2: SearchQuery -) -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: - param1 = _search_query_to_dict(sq1) - param2 = _search_query_to_dict(sq2) - common = {} - diff = {} - for k, value1 in param1.items(): - value2 = param2[k] - if value1 == value2: - common[k] = value1 - else: - diff[k] = (value1, value2) - return (common, diff) - - -class TestResults: - - __slots__ = 'errors', 'logs', 'languages' - - def __init__(self): - self.errors: typing.Dict[str, typing.List[str]] = {} - self.logs: typing.Dict[str, typing.List[typing.Any]] = {} - self.languages: typing.Set[str] = set() - - def add_error(self, test, message, *args): - # message to self.errors - errors_for_test = self.errors.setdefault(test, []) - if message not in errors_for_test: - errors_for_test.append(message) - # (message, *args) to self.logs - logs_for_test = self.logs.setdefault(test, []) - if (message, *args) not in logs_for_test: - logs_for_test.append((message, *args)) - - def add_language(self, language): - self.languages.add(language) - - @property - def successful(self): - return len(self.errors) == 0 - - def __iter__(self): - for test_name, errors in self.errors.items(): - for error in sorted(errors): - yield (test_name, error) - - -class ResultContainerTests: - - 
__slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results' - - def __init__( - self, test_results: TestResults, test_name: str, search_query: SearchQuery, result_container: ResultContainer - ): - self.test_name = test_name - self.search_query = search_query - self.result_container = result_container - self.languages: typing.Set[str] = set() - self.test_results = test_results - self.stop_test = False - - @property - def result_urls(self): - results = self.result_container.get_ordered_results() - return [result['url'] for result in results if 'url' in result] - - def _record_error(self, message: str, *args) -> None: - sq = _search_query_to_dict(self.search_query) - sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) - self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') - - def _add_language(self, text: str) -> typing.Optional[str]: - langStr = detect_language(text) - if langStr: - self.languages.add(langStr) - self.test_results.add_language(langStr) - return None - - def _check_result(self, result): - if not _check_no_html(result.get('title', '')): - self._record_error('HTML in title', repr(result.get('title', ''))) - if not _check_no_html(result.get('content', '')): - self._record_error('HTML in content', repr(result.get('content', ''))) - if result.get('url') is None: - self._record_error('url is None') - - self._add_language(result.get('title', '')) - self._add_language(result.get('content', '')) - - template = result.get('template', 'default.html') - if template == 'default.html': - return - if template == 'code.html': - return - if template == 'torrent.html': - return - if template == 'map.html': - return - if template == 'images.html': - thumbnail_src = result.get('thumbnail_src') - if thumbnail_src is not None: - if not _is_url_image(thumbnail_src): - self._record_error('thumbnail_src URL is invalid', thumbnail_src) - elif not _is_url_image(result.get('img_src')): - self._record_error('img_src URL is invalid', result.get('img_src')) - if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): - self._record_error('thumbnail URL is invalid', result.get('img_src')) - - def _check_results(self, results: list): - for result in results: - self._check_result(result) - - def _check_answers(self, answers): - for answer in answers: - if not _check_no_html(answer): - self._record_error('HTML in answer', answer) - - def _check_infoboxes(self, infoboxes): - for infobox in infoboxes: - if not _check_no_html(infobox.get('content', '')): - self._record_error('HTML in infobox content', infobox.get('content', '')) - self._add_language(infobox.get('content', '')) - for attribute in infobox.get('attributes', {}): - if not _check_no_html(attribute.get('value', '')): - self._record_error('HTML in infobox attribute value', attribute.get('value', '')) - - def check_basic(self): - if len(self.result_container.unresponsive_engines) > 0: - for message in self.result_container.unresponsive_engines: - self._record_error(message[1] + ' ' + (message[2] or '')) - self.stop_test = True - return - - results = self.result_container.get_ordered_results() - if len(results) > 0: - self._check_results(results) - - if len(self.result_container.answers) > 0: - self._check_answers(self.result_container.answers) - - if len(self.result_container.infoboxes) > 0: - self._check_infoboxes(self.result_container.infoboxes) - - def has_infobox(self): - """Check the ResultContainer has at least one infobox""" - if 
len(self.result_container.infoboxes) == 0: - self._record_error('No infobox') - - def has_answer(self): - """Check the ResultContainer has at least one answer""" - if len(self.result_container.answers) == 0: - self._record_error('No answer') - - def has_language(self, lang): - """Check at least one title or content of the results is written in the `lang`. - - Detected using pycld3, may be not accurate""" - if lang not in self.languages: - self._record_error(lang + ' not found') - - def not_empty(self): - """Check the ResultContainer has at least one answer or infobox or result""" - result_types = set() - results = self.result_container.get_ordered_results() - if len(results) > 0: - result_types.add('results') - - if len(self.result_container.answers) > 0: - result_types.add('answers') - - if len(self.result_container.infoboxes) > 0: - result_types.add('infoboxes') - - if len(result_types) == 0: - self._record_error('No result') - - def one_title_contains(self, title: str): - """Check one of the title contains `title` (case insensitive comparison)""" - title = title.lower() - for result in self.result_container.get_ordered_results(): - if title in result['title'].lower(): - return - self._record_error(('{!r} not found in the title'.format(title))) - - -class CheckerTests: - - __slots__ = 'test_results', 'test_name', 'result_container_tests_list' - - def __init__( - self, test_results: TestResults, test_name: str, result_container_tests_list: typing.List[ResultContainerTests] - ): - self.test_results = test_results - self.test_name = test_name - self.result_container_tests_list = result_container_tests_list - - def unique_results(self): - """Check the results of each ResultContainer is unique""" - urls_list = [rct.result_urls for rct in self.result_container_tests_list] - if len(urls_list[0]) > 0: - # results on the first page - for i, urls_i in enumerate(urls_list): - for j, urls_j in enumerate(urls_list): - if i < j and urls_i == urls_j: - common, diff = _search_query_diff( - self.result_container_tests_list[i].search_query, - self.result_container_tests_list[j].search_query, - ) - common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()]) - diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()]) - diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()]) - self.test_results.add_error( - self.test_name, - 'results are identitical for {} and {} ({})'.format(diff1_str, diff2_str, common_str), - ) - - -class Checker: - - __slots__ = 'processor', 'tests', 'test_results' - - def __init__(self, processor: EngineProcessor): - self.processor = processor - self.tests = self.processor.get_tests() - self.test_results = TestResults() - - @property - def engineref_list(self): - engine_name = self.processor.engine_name - engine_category = self.processor.engine.categories[0] - return [EngineRef(engine_name, engine_category)] - - @staticmethod - def search_query_matrix_iterator(engineref_list, matrix): - p = [] - for name, values in matrix.items(): - if isinstance(values, (tuple, list)): - l = [(name, value) for value in values] - else: - l = [(name, values)] - p.append(l) - - for kwargs in itertools.product(*p): - kwargs = {k: v for k, v in kwargs} - query = kwargs['query'] - params = dict(kwargs) - del params['query'] - yield SearchQuery(query, engineref_list, **params) - - def call_test(self, obj, test_description): - if isinstance(test_description, (tuple, list)): - method, args = test_description[0], test_description[1:] - else: - 
method = test_description - args = () - if isinstance(method, str) and hasattr(obj, method): - getattr(obj, method)(*args) - elif isinstance(method, types.FunctionType): - method(*args) - else: - self.test_results.add_error( - obj.test_name, - 'method {!r} ({}) not found for {}'.format(method, method.__class__.__name__, obj.__class__.__name__), - ) - - def call_tests(self, obj, test_descriptions): - for test_description in test_descriptions: - self.call_test(obj, test_description) - - def search(self, search_query: SearchQuery) -> ResultContainer: - result_container = ResultContainer() - engineref_category = search_query.engineref_list[0].category - params = self.processor.get_params(search_query, engineref_category) - if params is not None: - counter_inc('engine', search_query.engineref_list[0].name, 'search', 'count', 'sent') - self.processor.search(search_query.query, params, result_container, default_timer(), 5) - return result_container - - def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests: - result_container = self.search(search_query) - result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container) - result_container_check.check_basic() - return result_container_check - - def run_test(self, test_name): - test_parameters = self.tests[test_name] - search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix'])) - rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list] - stop_test = False - if 'result_container' in test_parameters: - for rct in rct_list: - stop_test = stop_test or rct.stop_test - if not rct.stop_test: - self.call_tests(rct, test_parameters['result_container']) - if not stop_test: - if 'test' in test_parameters: - checker_tests = CheckerTests(self.test_results, test_name, rct_list) - self.call_tests(checker_tests, test_parameters['test']) - - def run(self): - for test_name in self.tests: - self.run_test(test_name) - # clear cache - _download_and_check_if_image.cache_clear() - # force a garbage collector - gc.collect() diff --git a/apps/searxng/searx/search/checker/scheduler.py b/apps/searxng/searx/search/checker/scheduler.py deleted file mode 100755 index cc3bb73..0000000 --- a/apps/searxng/searx/search/checker/scheduler.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring -"""Lame scheduler which use Redis as a source of truth: -* the Redis key SearXNG_checker_next_call_ts contains the next time the embedded checker should run. -* to avoid lock, a unique Redis script reads and updates the Redis key SearXNG_checker_next_call_ts. -* this Redis script returns a list of two elements: - * the first one is a boolean. If True, the embedded checker must run now in this worker. - * the second element is the delay in second to wait before the next call to the Redis script. 
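# Rough pure-Python stand-in for the scheduling contract described in the two
# bullet points above: the Redis-side script decides (a) whether this worker
# runs the checker now and (b) how many seconds to wait before asking again.
# The real logic lives in scheduler.lua and is loaded via lua_script_storage();
# fake_script and the bounded loop below are invented so the sketch terminates.
import random
import time

def fake_script(start_after_from, start_after_to, every_from, every_to):
    # pretend the stored timestamp already expired: run now, retry later
    return True, random.randint(every_from, every_to)

def run_checker_loop(callback, iterations=3):
    for _ in range(iterations):            # the real loop never returns
        call_now, wait_time = fake_script(300, 500, 3600, 7200)
        if call_now:
            try:
                callback()
            except Exception:              # never let the worker die on a checker error
                pass
        time.sleep(min(wait_time, 0.01))   # the real code sleeps the full wait_time

run_checker_loop(lambda: print('embedded checker would run here'))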
- -This scheduler is not generic on purpose: if more feature are required, a dedicate scheduler must be used -(= a better scheduler should not use the web workers) -""" - -import logging -import time -import importlib -from typing import Callable - -from searx.redisdb import client as get_redis_client -from searx.redislib import lua_script_storage - - -logger = logging.getLogger('searx.search.checker') - - -def scheduler_function(start_after_from: int, start_after_to: int, every_from: int, every_to: int, callback: Callable): - """Run the checker periodically. The function never returns. - - Parameters: - * start_after_from and start_after_to: when to call "callback" for the first on the Redis instance - * every_from and every_to: after the first call, how often to call "callback" - - There is no issue: - * to call this function is multiple workers - * to kill workers at any time as long there is one at least one worker - """ - scheduler_now_script = importlib.resources.read_text(__package__, "scheduler.lua") - while True: - # ask the Redis script what to do - # the script says - # * if the checker must run now. - # * how to long to way before calling the script again (it can be call earlier, but not later). - script = lua_script_storage(get_redis_client(), scheduler_now_script) - call_now, wait_time = script(args=[start_after_from, start_after_to, every_from, every_to]) - - # does the worker run the checker now? - if call_now: - # run the checker - try: - callback() - except Exception: # pylint: disable=broad-except - logger.exception("Error calling the embedded checker") - # only worker display the wait_time - logger.info("Next call to the checker in %s seconds", wait_time) - # wait until the next call - time.sleep(wait_time) diff --git a/apps/searxng/searx/search/models.py b/apps/searxng/searx/search/models.py deleted file mode 100755 index 91e5d59..0000000 --- a/apps/searxng/searx/search/models.py +++ /dev/null @@ -1,124 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import typing -import babel - - -class EngineRef: - """Reference by names to an engine and category""" - - __slots__ = 'name', 'category' - - def __init__(self, name: str, category: str): - self.name = name - self.category = category - - def __repr__(self): - return "EngineRef({!r}, {!r})".format(self.name, self.category) - - def __eq__(self, other): - return self.name == other.name and self.category == other.category - - def __hash__(self): - return hash((self.name, self.category)) - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - __slots__ = ( - 'query', - 'engineref_list', - 'lang', - 'locale', - 'safesearch', - 'pageno', - 'time_range', - 'timeout_limit', - 'external_bang', - 'engine_data', - ) - - def __init__( - self, - query: str, - engineref_list: typing.List[EngineRef], - lang: str = 'all', - safesearch: int = 0, - pageno: int = 1, - time_range: typing.Optional[str] = None, - timeout_limit: typing.Optional[float] = None, - external_bang: typing.Optional[str] = None, - engine_data: typing.Optional[typing.Dict[str, str]] = None, - ): - self.query = query - self.engineref_list = engineref_list - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = time_range - self.timeout_limit = timeout_limit - self.external_bang = external_bang - self.engine_data = engine_data or {} - - self.locale = None - if self.lang: - try: - self.locale = babel.Locale.parse(self.lang, sep='-') - except babel.core.UnknownLocaleError: - pass - - 
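# Small illustration of the locale handling just above, assuming babel is
# installed: a BCP-47 style tag such as 'en-US' parses with sep='-', while a
# tag babel does not know (including the catch-all 'all') is swallowed and
# leaves the locale as None, mirroring the except clause in SearchQuery.
import babel

def parse_locale(lang):
    try:
        return babel.Locale.parse(lang, sep='-')
    except (babel.core.UnknownLocaleError, ValueError):
        return None

parse_locale('en-US')   # -> Locale('en', territory='US')
parse_locale('all')     # -> None (unknown tag is silently ignored)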
@property - def categories(self): - return list(set(map(lambda engineref: engineref.category, self.engineref_list))) - - def __repr__(self): - return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".format( - self.query, - self.engineref_list, - self.lang, - self.safesearch, - self.pageno, - self.time_range, - self.timeout_limit, - self.external_bang, - ) - - def __eq__(self, other): - return ( - self.query == other.query - and self.engineref_list == other.engineref_list - and self.lang == other.lang - and self.safesearch == other.safesearch - and self.pageno == other.pageno - and self.time_range == other.time_range - and self.timeout_limit == other.timeout_limit - and self.external_bang == other.external_bang - ) - - def __hash__(self): - return hash( - ( - self.query, - tuple(self.engineref_list), - self.lang, - self.safesearch, - self.pageno, - self.time_range, - self.timeout_limit, - self.external_bang, - ) - ) - - def __copy__(self): - return SearchQuery( - self.query, - self.engineref_list, - self.lang, - self.safesearch, - self.pageno, - self.time_range, - self.timeout_limit, - self.external_bang, - self.engine_data, - ) diff --git a/apps/searxng/searx/search/processors/__init__.py b/apps/searxng/searx/search/processors/__init__.py deleted file mode 100755 index 1390de4..0000000 --- a/apps/searxng/searx/search/processors/__init__.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint - -"""Implement request processores used by engine-types. - -""" - -__all__ = [ - 'EngineProcessor', - 'OfflineProcessor', - 'OnlineProcessor', - 'OnlineDictionaryProcessor', - 'OnlineCurrencyProcessor', - 'OnlineUrlSearchProcessor', - 'PROCESSORS', -] - -import threading -from typing import Dict - -from searx import logger -from searx import engines - -from .online import OnlineProcessor -from .offline import OfflineProcessor -from .online_dictionary import OnlineDictionaryProcessor -from .online_currency import OnlineCurrencyProcessor -from .online_url_search import OnlineUrlSearchProcessor -from .abstract import EngineProcessor - -logger = logger.getChild('search.processors') -PROCESSORS: Dict[str, EngineProcessor] = {} -"""Cache request processores, stored by *engine-name* (:py:func:`initialize`) - -:meta hide-value: -""" - - -def get_processor_class(engine_type): - """Return processor class according to the ``engine_type``""" - for c in [ - OnlineProcessor, - OfflineProcessor, - OnlineDictionaryProcessor, - OnlineCurrencyProcessor, - OnlineUrlSearchProcessor, - ]: - if c.engine_type == engine_type: - return c - return None - - -def get_processor(engine, engine_name): - """Return processor instance that fits to ``engine.engine.type``)""" - engine_type = getattr(engine, 'engine_type', 'online') - processor_class = get_processor_class(engine_type) - if processor_class: - return processor_class(engine, engine_name) - return None - - -def initialize_processor(processor): - """Initialize one processor - - Call the init function of the engine - """ - if processor.has_initialize_function: - t = threading.Thread(target=processor.initialize, daemon=True) - t.start() - - -def initialize(engine_list): - """Initialize all engines and store a processor for each engine in :py:obj:`PROCESSORS`.""" - for engine_data in engine_list: - engine_name = engine_data['name'] - engine = engines.engines.get(engine_name) - if engine: - processor = get_processor(engine, engine_name) - initialize_processor(processor) - if processor is None: - engine.logger.error('Error get 
processor for engine %s', engine_name) - else: - PROCESSORS[engine_name] = processor diff --git a/apps/searxng/searx/search/processors/abstract.py b/apps/searxng/searx/search/processors/abstract.py deleted file mode 100755 index ace730e..0000000 --- a/apps/searxng/searx/search/processors/abstract.py +++ /dev/null @@ -1,191 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint - -"""Abstract base classes for engine request processores. - -""" - -import threading -from abc import abstractmethod, ABC -from timeit import default_timer -from typing import Dict, Union - -from searx import settings, logger -from searx.engines import engines -from searx.network import get_time_for_thread, get_network -from searx.metrics import histogram_observe, counter_inc, count_exception, count_error -from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineResponseException -from searx.utils import get_engine_from_settings - -logger = logger.getChild('searx.search.processor') -SUSPENDED_STATUS: Dict[Union[int, str], 'SuspendedStatus'] = {} - - -class SuspendedStatus: - """Class to handle suspend state.""" - - __slots__ = 'suspend_end_time', 'suspend_reason', 'continuous_errors', 'lock' - - def __init__(self): - self.lock = threading.Lock() - self.continuous_errors = 0 - self.suspend_end_time = 0 - self.suspend_reason = None - - @property - def is_suspended(self): - return self.suspend_end_time >= default_timer() - - def suspend(self, suspended_time, suspend_reason): - with self.lock: - # update continuous_errors / suspend_end_time - self.continuous_errors += 1 - if suspended_time is None: - suspended_time = min( - settings['search']['max_ban_time_on_fail'], - self.continuous_errors * settings['search']['ban_time_on_fail'], - ) - self.suspend_end_time = default_timer() + suspended_time - self.suspend_reason = suspend_reason - logger.debug('Suspend for %i seconds', suspended_time) - - def resume(self): - with self.lock: - # reset the suspend variables - self.continuous_errors = 0 - self.suspend_end_time = 0 - self.suspend_reason = None - - -class EngineProcessor(ABC): - """Base classes used for all types of reqest processores.""" - - __slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger' - - def __init__(self, engine, engine_name: str): - self.engine = engine - self.engine_name = engine_name - self.logger = engines[engine_name].logger - key = get_network(self.engine_name) - key = id(key) if key else self.engine_name - self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus()) - - def initialize(self): - try: - self.engine.init(get_engine_from_settings(self.engine_name)) - except SearxEngineResponseException as exc: - self.logger.warning('Fail to initialize // %s', exc) - except Exception: # pylint: disable=broad-except - self.logger.exception('Fail to initialize') - else: - self.logger.debug('Initialized') - - @property - def has_initialize_function(self): - return hasattr(self.engine, 'init') - - def handle_exception(self, result_container, exception_or_message, suspend=False): - # update result_container - if isinstance(exception_or_message, BaseException): - exception_class = exception_or_message.__class__ - module_name = getattr(exception_class, '__module__', 'builtins') - module_name = '' if module_name == 'builtins' else module_name + '.' 
- error_message = module_name + exception_class.__qualname__ - else: - error_message = exception_or_message - result_container.add_unresponsive_engine(self.engine_name, error_message) - # metrics - counter_inc('engine', self.engine_name, 'search', 'count', 'error') - if isinstance(exception_or_message, BaseException): - count_exception(self.engine_name, exception_or_message) - else: - count_error(self.engine_name, exception_or_message) - # suspend the engine ? - if suspend: - suspended_time = None - if isinstance(exception_or_message, SearxEngineAccessDeniedException): - suspended_time = exception_or_message.suspended_time - self.suspended_status.suspend(suspended_time, error_message) # pylint: disable=no-member - - def _extend_container_basic(self, result_container, start_time, search_results): - # update result_container - result_container.extend(self.engine_name, search_results) - engine_time = default_timer() - start_time - page_load_time = get_time_for_thread() - result_container.add_timing(self.engine_name, engine_time, page_load_time) - # metrics - counter_inc('engine', self.engine_name, 'search', 'count', 'successful') - histogram_observe(engine_time, 'engine', self.engine_name, 'time', 'total') - if page_load_time is not None: - histogram_observe(page_load_time, 'engine', self.engine_name, 'time', 'http') - - def extend_container(self, result_container, start_time, search_results): - if getattr(threading.current_thread(), '_timeout', False): - # the main thread is not waiting anymore - self.handle_exception(result_container, 'timeout', None) - else: - # check if the engine accepted the request - if search_results is not None: - self._extend_container_basic(result_container, start_time, search_results) - self.suspended_status.resume() - - def extend_container_if_suspended(self, result_container): - if self.suspended_status.is_suspended: - result_container.add_unresponsive_engine( - self.engine_name, self.suspended_status.suspend_reason, suspended=True - ) - return True - return False - - def get_params(self, search_query, engine_category): - """Returns a set of (see :ref:`request params `) or - ``None`` if request is not supported. - - Not supported conditions (``None`` is returned): - - - A page-number > 1 when engine does not support paging. - - A time range when the engine does not support time range. - """ - # if paging is not supported, skip - if search_query.pageno > 1 and not self.engine.paging: - return None - - # if time_range is not supported, skip - if search_query.time_range and not self.engine.time_range_support: - return None - - params = {} - params['category'] = engine_category - params['pageno'] = search_query.pageno - params['safesearch'] = search_query.safesearch - params['time_range'] = search_query.time_range - params['engine_data'] = search_query.engine_data.get(self.engine_name, {}) - params['searxng_locale'] = search_query.lang - - # deprecated / vintage --> use params['searxng_locale'] - # - # Conditions related to engine's traits are implemented in engine.traits - # module. Don't do 'locale' decissions here in the abstract layer of the - # search processor, just pass the value from user's choice unchanged to - # the engine request. 
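# Standalone sketch of the capability checks performed by get_params(): an
# engine that lacks paging or time-range support makes the processor skip the
# request by returning None, otherwise a plain params dict is built.  DemoEngine,
# build_params and the literal values are invented for illustration; the real
# code reads the engine module's `paging` and `time_range_support` attributes
# exactly as shown above.
class DemoEngine:
    paging = False
    time_range_support = False
    language = 'en'

def build_params(engine, pageno, time_range, lang):
    if pageno > 1 and not engine.paging:
        return None                      # page 2+ unsupported -> skip this engine
    if time_range and not engine.time_range_support:
        return None                      # time filter unsupported -> skip this engine
    return {
        'pageno': pageno,
        'time_range': time_range,
        'searxng_locale': lang,
        # vintage fallback kept for engines that pin a fixed language
        'language': getattr(engine, 'language', None) or lang,
    }

build_params(DemoEngine(), pageno=2, time_range=None, lang='de-DE')  # -> None
build_params(DemoEngine(), pageno=1, time_range=None, lang='de-DE')  # -> dict with language='en'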
- - if hasattr(self.engine, 'language') and self.engine.language: - params['language'] = self.engine.language - else: - params['language'] = search_query.lang - - return params - - @abstractmethod - def search(self, query, params, result_container, start_time, timeout_limit): - pass - - def get_tests(self): - tests = getattr(self.engine, 'tests', None) - if tests is None: - tests = getattr(self.engine, 'additional_tests', {}) - tests.update(self.get_default_tests()) - return tests - - def get_default_tests(self): - return {} diff --git a/apps/searxng/searx/search/processors/offline.py b/apps/searxng/searx/search/processors/offline.py deleted file mode 100755 index 13f077c..0000000 --- a/apps/searxng/searx/search/processors/offline.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint - -"""Processores for engine-type: ``offline`` - -""" - -from .abstract import EngineProcessor - - -class OfflineProcessor(EngineProcessor): - """Processor class used by ``offline`` engines""" - - engine_type = 'offline' - - def _search_basic(self, query, params): - return self.engine.search(query, params) - - def search(self, query, params, result_container, start_time, timeout_limit): - try: - search_results = self._search_basic(query, params) - self.extend_container(result_container, start_time, search_results) - except ValueError as e: - # do not record the error - self.logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e)) - except Exception as e: # pylint: disable=broad-except - self.handle_exception(result_container, e) - self.logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e)) diff --git a/apps/searxng/searx/search/processors/online.py b/apps/searxng/searx/search/processors/online.py deleted file mode 100755 index 7b2ec85..0000000 --- a/apps/searxng/searx/search/processors/online.py +++ /dev/null @@ -1,238 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint - -"""Processores for engine-type: ``online`` - -""" -# pylint: disable=use-dict-literal - -from timeit import default_timer -import asyncio -import ssl -import httpx - -import searx.network -from searx.utils import gen_useragent -from searx.exceptions import ( - SearxEngineAccessDeniedException, - SearxEngineCaptchaException, - SearxEngineTooManyRequestsException, -) -from searx.metrics.error_recorder import count_error -from .abstract import EngineProcessor - - -def default_request_params(): - """Default request parameters for ``online`` engines.""" - return { - # fmt: off - 'method': 'GET', - 'headers': {}, - 'data': {}, - 'url': '', - 'cookies': {}, - 'auth': None - # fmt: on - } - - -class OnlineProcessor(EngineProcessor): - """Processor class for ``online`` engines.""" - - engine_type = 'online' - - def initialize(self): - # set timeout for all HTTP requests - searx.network.set_timeout_for_thread(self.engine.timeout, start_time=default_timer()) - # reset the HTTP total time - searx.network.reset_time_for_thread() - # set the network - searx.network.set_context_network_name(self.engine_name) - super().initialize() - - def get_params(self, search_query, engine_category): - """Returns a set of :ref:`request params ` or ``None`` - if request is not supported. 
- """ - params = super().get_params(search_query, engine_category) - if params is None: - return None - - # add default params - params.update(default_request_params()) - - # add an user agent - params['headers']['User-Agent'] = gen_useragent() - - # add Accept-Language header - if self.engine.send_accept_language_header and search_query.locale: - ac_lang = search_query.locale.language - if search_query.locale.territory: - ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( - search_query.locale.language, - search_query.locale.territory, - search_query.locale.language, - ) - params['headers']['Accept-Language'] = ac_lang - - self.logger.debug('HTTP Accept-Language: %s', params['headers'].get('Accept-Language', '')) - return params - - def _send_http_request(self, params): - # create dictionary which contain all - # information about the request - request_args = dict(headers=params['headers'], cookies=params['cookies'], auth=params['auth']) - - # verify - # if not None, it overrides the verify value defined in the network. - # use False to accept any server certificate - # use a path to file to specify a server certificate - verify = params.get('verify') - if verify is not None: - request_args['verify'] = params['verify'] - - # max_redirects - max_redirects = params.get('max_redirects') - if max_redirects: - request_args['max_redirects'] = max_redirects - - # allow_redirects - if 'allow_redirects' in params: - request_args['allow_redirects'] = params['allow_redirects'] - - # soft_max_redirects - soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0) - - # raise_for_status - request_args['raise_for_httperror'] = params.get('raise_for_httperror', True) - - # specific type of request (GET or POST) - if params['method'] == 'GET': - req = searx.network.get - else: - req = searx.network.post - - request_args['data'] = params['data'] - - # send the request - response = req(params['url'], **request_args) - - # check soft limit of the redirect count - if len(response.history) > soft_max_redirects: - # unexpected redirect : record an error - # but the engine might still return valid results. 
- status_code = str(response.status_code or '') - reason = response.reason_phrase or '' - hostname = response.url.host - count_error( - self.engine_name, - '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects), - (status_code, reason, hostname), - secondary=True, - ) - - return response - - def _search_basic(self, query, params): - # update request parameters dependent on - # search-engine (contained in engines folder) - self.engine.request(query, params) - - # ignoring empty urls - if params['url'] is None: - return None - - if not params['url']: - return None - - # send request - response = self._send_http_request(params) - - # parse the response - response.search_params = params - return self.engine.response(response) - - def search(self, query, params, result_container, start_time, timeout_limit): - # set timeout for all HTTP requests - searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time) - # reset the HTTP total time - searx.network.reset_time_for_thread() - # set the network - searx.network.set_context_network_name(self.engine_name) - - try: - # send requests and parse the results - search_results = self._search_basic(query, params) - self.extend_container(result_container, start_time, search_results) - except ssl.SSLError as e: - # requests timeout (connect or read) - self.handle_exception(result_container, e, suspend=True) - self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine_name).verify)) - except (httpx.TimeoutException, asyncio.TimeoutError) as e: - # requests timeout (connect or read) - self.handle_exception(result_container, e, suspend=True) - self.logger.error( - "HTTP requests timeout (search duration : {0} s, timeout: {1} s) : {2}".format( - default_timer() - start_time, timeout_limit, e.__class__.__name__ - ) - ) - except (httpx.HTTPError, httpx.StreamError) as e: - # other requests exception - self.handle_exception(result_container, e, suspend=True) - self.logger.exception( - "requests exception (search duration : {0} s, timeout: {1} s) : {2}".format( - default_timer() - start_time, timeout_limit, e - ) - ) - except SearxEngineCaptchaException as e: - self.handle_exception(result_container, e, suspend=True) - self.logger.exception('CAPTCHA') - except SearxEngineTooManyRequestsException as e: - self.handle_exception(result_container, e, suspend=True) - self.logger.exception('Too many requests') - except SearxEngineAccessDeniedException as e: - self.handle_exception(result_container, e, suspend=True) - self.logger.exception('Searx is blocked') - except Exception as e: # pylint: disable=broad-except - self.handle_exception(result_container, e) - self.logger.exception('exception : {0}'.format(e)) - - def get_default_tests(self): - tests = {} - - tests['simple'] = { - 'matrix': {'query': ('life', 'computer')}, - 'result_container': ['not_empty'], - } - - if getattr(self.engine, 'paging', False): - tests['paging'] = { - 'matrix': {'query': 'time', 'pageno': (1, 2, 3)}, - 'result_container': ['not_empty'], - 'test': ['unique_results'], - } - if 'general' in self.engine.categories: - # avoid documentation about HTML tags (